source: branches/3128_Prediction_Intervals/HeuristicLab.Algorithms.DataAnalysis.Glmnet/3.4/ElasticNetLinearRegression.cs @ 17991

Last change on this file since 17991 was 17991, checked in by gkronber, 5 months ago

#3128: first dump of exploratory work-in-progress code to make sure the working copy is not lost.

File size: 21.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading;
26using HeuristicLab.Analysis;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Optimization;
31using HeuristicLab.Parameters;
32using HEAL.Attic;
33using HeuristicLab.Problems.DataAnalysis;
34using HeuristicLab.Problems.DataAnalysis.Symbolic;
35using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
36using HeuristicLab.Analysis.Statistics;
37
38namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
39  [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
40  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
41  [StorableType("529EDD40-91F3-4F3E-929F-852A3EF9B02B")]
42  public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
43    private const string PenalityParameterName = "Penality";
44    private const string LambdaParameterName = "Lambda";
45    #region parameters
46    public IFixedValueParameter<DoubleValue> PenalityParameter {
47      get { return (IFixedValueParameter<DoubleValue>)Parameters[PenalityParameterName]; }
48    }
49    public IValueParameter<DoubleValue> LambdaParameter {
50      get { return (IValueParameter<DoubleValue>)Parameters[LambdaParameterName]; }
51    }
52    #endregion
53    #region properties
54    public double Penality {
55      get { return PenalityParameter.Value.Value; }
56      set { PenalityParameter.Value.Value = value; }
57    }
58    public DoubleValue Lambda {
59      get { return LambdaParameter.Value; }
60      set { LambdaParameter.Value = value; }
61    }
62    #endregion
63
64    [StorableConstructor]
65    private ElasticNetLinearRegression(StorableConstructorFlag _) : base(_) { }
66    private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
67      : base(original, cloner) {
68    }
69    public ElasticNetLinearRegression()
70      : base() {
71      Problem = new RegressionProblem();
72      Parameters.Add(new FixedValueParameter<DoubleValue>(PenalityParameterName, "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5)));
73      Parameters.Add(new OptionalValueParameter<DoubleValue>(LambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas"));
74    }
75
76    [StorableHook(HookType.AfterDeserialization)]
77    private void AfterDeserialization() { }
78
79    public override IDeepCloneable Clone(Cloner cloner) {
80      return new ElasticNetLinearRegression(this, cloner);
81    }
82
83    protected override void Run(CancellationToken cancellationToken) {
84      if (Lambda == null) {
85        CreateSolutionPath();
86      } else {
87        CreateSolution(Lambda.Value);
88      }
89    }
90
91    private void CreateSolution(double lambda) {
92      double trainNMSE;
93      double testNMSE;
94      var coeff = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);
95      Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE)));
96      Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE)));
97
98      var solution = CreateSymbolicSolution(coeff, Problem.ProblemData);
99      Results.Add(new Result(solution.Name, solution.Description, solution));
100
101      var xy = Problem.ProblemData.Dataset.ToArray(Problem.ProblemData.AllowedInputVariables.Concat(new[] { Problem.ProblemData.TargetVariable }), Problem.ProblemData.TrainingIndices);
102      // prepare xy for calculation of parameter statistics
103      // the last coefficient is the offset
104      var resid = new double[xy.GetLength(0)];
105      for (int r = 0; r < xy.GetLength(0); r++) {
106        resid[r] = xy[r, coeff.Length - 1] - coeff[coeff.Length - 1];
107        xy[r, coeff.Length - 1] = 1.0;
108      }
109      var statistics = Statistics.CalculateParameterStatistics(xy, coeff, resid);
110      Results.AddOrUpdateResult("Statistics", statistics.AsResultCollection(Problem.ProblemData.AllowedInputVariables.Concat(new string[] { "<const>" })));
111    }
112
113    public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
114      var ds = problemData.Dataset;
115      var allVariables = problemData.AllowedInputVariables.ToArray();
116      var doubleVariables = allVariables.Where(ds.VariableHasType<double>);
117      var factorVariableNames = allVariables.Where(ds.VariableHasType<string>);
118      var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
119
120      List<KeyValuePair<string, IEnumerable<string>>> remainingFactorVariablesAndValues = new List<KeyValuePair<string, IEnumerable<string>>>();
121      List<double> factorCoeff = new List<double>();
122      List<string> remainingDoubleVariables = new List<string>();
123      List<double> doubleVarCoeff = new List<double>();
124
125      {
126        int i = 0;
127        // find factor varibles & value combinations with non-zero coeff
128        foreach (var factorVarAndValues in factorVariablesAndValues) {
129          var l = new List<string>();
130          foreach (var factorValue in factorVarAndValues.Value) {
131            if (!coeff[i].IsAlmost(0.0)) {
132              l.Add(factorValue);
133              factorCoeff.Add(coeff[i]);
134            }
135            i++;
136          }
137          if (l.Any()) remainingFactorVariablesAndValues.Add(new KeyValuePair<string, IEnumerable<string>>(factorVarAndValues.Key, l));
138        }
139        // find double variables with non-zero coeff
140        foreach (var doubleVar in doubleVariables) {
141          if (!coeff[i].IsAlmost(0.0)) {
142            remainingDoubleVariables.Add(doubleVar);
143            doubleVarCoeff.Add(coeff[i]);
144          }
145          i++;
146        }
147      }
148      var tree = LinearModelToTreeConverter.CreateTree(
149        remainingFactorVariablesAndValues, factorCoeff.ToArray(),
150        remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(),
151        coeff.Last());
152
153
154      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
155        new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
156        (IRegressionProblemData)problemData.Clone());
157      solution.Model.Name = "Elastic-net Linear Regression Model";
158      solution.Name = "Elastic-net Linear Regression Solution";
159
160      return solution;
161    }
162
163    private void CreateSolutionPath() {
164      double[] lambda;
165      double[] trainNMSE;
166      double[] testNMSE;
167      double[,] coeff;
168      double[] intercept;
169      RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);
170
171      var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
172      coeffTable.VisualProperties.YAxisMaximumAuto = false;
173      coeffTable.VisualProperties.YAxisMinimumAuto = false;
174      coeffTable.VisualProperties.XAxisMaximumAuto = false;
175      coeffTable.VisualProperties.XAxisMinimumAuto = false;
176
177      coeffTable.VisualProperties.XAxisLogScale = true;
178      coeffTable.VisualProperties.XAxisTitle = "Lambda";
179      coeffTable.VisualProperties.YAxisTitle = "Coefficients";
180      coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";
181
182      var nLambdas = lambda.Length;
183      var nCoeff = coeff.GetLength(1);
184      var dataRows = new IndexedDataRow<double>[nCoeff];
185      var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
186      var numNonZeroCoeffs = new int[nLambdas];
187
188      var ds = Problem.ProblemData.Dataset;
189      var doubleVariables = allowedVars.Where(ds.VariableHasType<double>);
190      var factorVariableNames = allowedVars.Where(ds.VariableHasType<string>);
191      var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
192      {
193        int i = 0;
194        foreach (var factorVariableAndValues in factorVariablesAndValues) {
195          foreach (var factorValue in factorVariableAndValues.Value) {
196            double sigma = ds.GetStringValues(factorVariableAndValues.Key)
197              .Select(s => s == factorValue ? 1.0 : 0.0)
198              .StandardDeviation(); // calc std dev of binary indicator
199            var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
200            dataRows[i] = new IndexedDataRow<double>(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path);
201            i++;
202          }
203        }
204
205        foreach (var doubleVariable in doubleVariables) {
206          double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
207          var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
208          dataRows[i] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path);
209          i++;
210        }
211        // add to coeffTable by total weight (larger area under the curve => more important);
212        foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) {
213          coeffTable.Rows.Add(r);
214        }
215      }
216
217      for (int i = 0; i < coeff.GetLength(0); i++) {
218        for (int j = 0; j < coeff.GetLength(1); j++) {
219          if (!coeff[i, j].IsAlmost(0.0)) {
220            numNonZeroCoeffs[i]++;
221          }
222        }
223      }
224      if (lambda.Length > 2) {
225        coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
226        coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
227      }
228      coeffTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
229      coeffTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
230      coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
231
232      Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));
233
234      var errorTable = new IndexedDataTable<double>("NMSE", "Path of NMSE values over different lambda values");
235      errorTable.VisualProperties.YAxisMaximumAuto = false;
236      errorTable.VisualProperties.YAxisMinimumAuto = false;
237      errorTable.VisualProperties.XAxisMaximumAuto = false;
238      errorTable.VisualProperties.XAxisMinimumAuto = false;
239
240      errorTable.VisualProperties.YAxisMinimumFixedValue = 0;
241      errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0;
242      errorTable.VisualProperties.XAxisLogScale = true;
243      errorTable.VisualProperties.XAxisTitle = "Lambda";
244      errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)";
245      errorTable.VisualProperties.SecondYAxisTitle = "Number of variables";
246      errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (train)", "Path of NMSE values over different lambda values", lambda.Zip(trainNMSE, (l, v) => Tuple.Create(l, v))));
247      errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (test)", "Path of NMSE values over different lambda values", lambda.Zip(testNMSE, (l, v) => Tuple.Create(l, v))));
248      errorTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
249      if (lambda.Length > 2) {
250        errorTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
251        errorTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
252      }
253      errorTable.Rows["NMSE (train)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
254      errorTable.Rows["NMSE (test)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
255      errorTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
256      errorTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
257
258      Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
259    }
260
261    public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda,
262            out double trainNMSE, out double testNMSE,
263            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
264      double[] trainNMSEs;
265      double[] testNMSEs;
266      // run for exactly one lambda
267      var coeffs = CalculateModelCoefficients(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
268      trainNMSE = trainNMSEs[0];
269      testNMSE = testNMSEs[0];
270      return coeffs[0];
271    }
272    public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
273            out double[] trainNMSEs, out double[] testNMSEs,
274            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
275            int maxVars = -1) {
276      // run for multiple user-supplied lambdas
277      double[,] coeff;
278      double[] intercept;
279      RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
280
281      int nRows = intercept.Length;
282      int nCols = coeff.GetLength(1) + 1;
283      double[][] sols = new double[nRows][];
284      for (int solIdx = 0; solIdx < nRows; solIdx++) {
285        sols[solIdx] = new double[nCols];
286        for (int cIdx = 0; cIdx < nCols - 1; cIdx++) {
287          sols[solIdx][cIdx] = coeff[solIdx, cIdx];
288        }
289        sols[solIdx][nCols - 1] = intercept[solIdx];
290      }
291      return sols;
292    }
293
294    public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
295      out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
296      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
297      int maxVars = -1
298      ) {
299      double[] userLambda = new double[0];
300      // automatically determine lambda values (maximum 100 different lambda values)
301      RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
302    }
303
304    /// <summary>
305    /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
306    /// </summary>
307    /// <param name="problemData">Predictor target matrix x and target vector y</param>
308    /// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
309    /// <param name="nlam">Maximum number of lambda values (default 100)</param>
310    /// <param name="flmin">User control of lambda values (&lt;1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values</param>
311    /// <param name="ulam">User supplied lambda values</param>
312    /// <param name="lambda">Output lambda values</param>
313    /// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
314    /// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
315    /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
316    /// <param name="intercept">Vector of intercepts for each solution along the path</param>
317    /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
318    /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
319    /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
320    private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
321  int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
322  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
323  int maxVars = -1
324  ) {
325      if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty");
326
327      double[,] trainX;
328      double[,] testX;
329      double[] trainY;
330      double[] testY;
331
332      PrepareData(problemData, out trainX, out trainY, out testX, out testY);
333      var numTrainObs = trainX.GetLength(1);
334      var numTestObs = testX.GetLength(1);
335      var numVars = trainX.GetLength(0);
336
337      int ka = 1; // => covariance updating algorithm
338      double parm = penalty;
339      double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
340      int[] jd = new int[1]; // do not force to use any of the variables
341      double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // all predictor variables are unpenalized
342      double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
343      for (int i = 0; i < numVars; i++) {
344        cl[i, 0] = coeffLowerBound;
345        cl[i, 1] = coeffUpperBound;
346      }
347
348      int ne = maxVars > 0 ? maxVars : numVars;
349      int nx = numVars;
350      double thr = 1.0e-5; // default value as recommended in glmnet
351      int isd = 1; //  => regression on standardized predictor variables
352      int intr = 1;  // => do include intercept in model
353      int maxit = 100000; // default value as recommended in glmnet
354      // outputs
355      int lmu = -1;
356      double[,] ca;
357      int[] ia;
358      int[] nin;
359      int nlp = -99;
360      int jerr = -99;
361      double[] trainR2;
362      Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);
363
364      trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
365      testNMSE = new double[lmu];
366      coeff = new double[lmu, numVars];
367      for (int solIdx = 0; solIdx < lmu; solIdx++) {
368        trainNMSE[solIdx] = 1.0 - trainR2[solIdx];
369
370        // uncompress coefficients of solution
371        int selectedNin = nin[solIdx];
372        double[] coefficients;
373        double[] selectedCa = new double[nx];
374        for (int i = 0; i < nx; i++) {
375          selectedCa[i] = ca[solIdx, i];
376        }
377
378        // apply to test set to calculate test NMSE values for each lambda step
379        double[] fn;
380        Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
381        OnlineCalculatorError error;
382        var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
383        if (error != OnlineCalculatorError.None) nmse = double.NaN;
384        testNMSE[solIdx] = nmse;
385
386        // uncompress coefficients
387        Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
388        for (int i = 0; i < coefficients.Length; i++) {
389          coeff[solIdx, i] = coefficients[i];
390        }
391      }
392    }
393
394    private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
395      out double[,] testX, out double[] testY) {
396      var ds = problemData.Dataset;
397      var targetVariable = problemData.TargetVariable;
398      var allowedInputs = problemData.AllowedInputVariables;
399      trainX = PrepareInputData(ds, allowedInputs, problemData.TrainingIndices);
400      trainY = ds.GetDoubleValues(targetVariable, problemData.TrainingIndices).ToArray();
401
402      testX = PrepareInputData(ds, allowedInputs, problemData.TestIndices);
403      testY = ds.GetDoubleValues(targetVariable, problemData.TestIndices).ToArray();
404    }
405
406    private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) {
407      var doubleVariables = allowedInputs.Where(ds.VariableHasType<double>);
408      var factorVariableNames = allowedInputs.Where(ds.VariableHasType<string>);
409      var factorVariables = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
410      double[,] binaryMatrix = ds.ToArray(factorVariables, rows);
411      double[,] doubleVarMatrix = ds.ToArray(doubleVariables, rows);
412      var x = binaryMatrix.HorzCat(doubleVarMatrix);
413      return x.Transpose();
414    }
415  }
416}
Note: See TracBrowser for help on using the repository browser.