Free cookie consent management tool by TermsFeed Policy Generator

source: stable/HeuristicLab.Algorithms.DataAnalysis.Glmnet/3.4/ElasticNetLinearRegression.cs @ 17991

Last change on this file since 17991 was 17181, checked in by swagner, 5 years ago

#2875: Merged r17180 from trunk to stable

File size: 21.0 KB
RevLine 
#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
[15147]23using System.Collections.Generic;
[13927]24using System.Linq;
[14674]25using System.Threading;
[13927]26using HeuristicLab.Analysis;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Optimization;
31using HeuristicLab.Parameters;
[17097]32using HEAL.Attic;
[13927]33using HeuristicLab.Problems.DataAnalysis;
[13930]34using HeuristicLab.Problems.DataAnalysis.Symbolic;
35using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
[13927]36
[14225]37namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
[13927]38  [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
39  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
[17097]40  [StorableType("529EDD40-91F3-4F3E-929F-852A3EF9B02B")]
[13927]41  public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
// Names under which the two algorithm parameters are registered in (and looked up from) the Parameters collection.
private const string PenalityParameterName = "Penality";
private const string LambdaParameterName = "Lambda";
#region parameters
/// <summary>
/// The parameter registered under "Penality", cast to its fixed-value parameter type.
/// </summary>
public IFixedValueParameter<DoubleValue> PenalityParameter {
  get {
    var parameter = Parameters[PenalityParameterName];
    return (IFixedValueParameter<DoubleValue>)parameter;
  }
}
/// <summary>
/// The parameter registered under "Lambda", cast to a value parameter.
/// </summary>
public IValueParameter<DoubleValue> LambdaParameter {
  get {
    var parameter = Parameters[LambdaParameterName];
    return (IValueParameter<DoubleValue>)parameter;
  }
}
#endregion
#region properties
/// <summary>
/// Convenience accessor for the numeric value stored in <see cref="PenalityParameter"/>.
/// </summary>
public double Penality {
  get {
    DoubleValue holder = PenalityParameter.Value;
    return holder.Value;
  }
  set {
    DoubleValue holder = PenalityParameter.Value;
    holder.Value = value;
  }
}
/// <summary>
/// Convenience accessor for the value object stored in <see cref="LambdaParameter"/>.
/// Run() treats a null value as "no fixed lambda" and computes the whole path instead.
/// </summary>
public DoubleValue Lambda {
  get {
    return LambdaParameter.Value;
  }
  set {
    LambdaParameter.Value = value;
  }
}
#endregion
62
/// <summary>
/// Constructor used by the persistence framework; only forwards the flag to the base class.
/// </summary>
[StorableConstructor]
private ElasticNetLinearRegression(StorableConstructorFlag _)
  : base(_) {
}

/// <summary>
/// Cloning constructor; all state is copied by the base class.
/// </summary>
private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
  : base(original, cloner) {
}

/// <summary>
/// Creates the algorithm with a fresh regression problem, a penalty of 0.5 and no fixed lambda.
/// </summary>
public ElasticNetLinearRegression()
  : base() {
  Problem = new RegressionProblem();

  var penalityParameter = new FixedValueParameter<DoubleValue>(
    PenalityParameterName,
    "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression",
    new DoubleValue(0.5));
  Parameters.Add(penalityParameter);

  // lambda is optional: it carries no value until the user sets one
  var lambdaParameter = new OptionalValueParameter<DoubleValue>(
    LambdaParameterName,
    "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas");
  Parameters.Add(lambdaParameter);
}
74
/// <summary>
/// Hook invoked after deserialization; intentionally empty.
/// </summary>
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
}
77
/// <summary>
/// Creates a deep copy of this algorithm via the private cloning constructor.
/// </summary>
/// <param name="cloner">Cloner passed through to the cloning constructor</param>
/// <returns>The newly created copy</returns>
public override IDeepCloneable Clone(Cloner cloner) {
  var copy = new ElasticNetLinearRegression(this, cloner);
  return copy;
}
81
/// <summary>
/// Entry point of the algorithm: computes a single solution when a lambda value is set,
/// otherwise the whole solution path.
/// </summary>
/// <param name="cancellationToken">Not used by this implementation</param>
protected override void Run(CancellationToken cancellationToken) {
  var fixedLambda = Lambda;
  if (fixedLambda != null) {
    CreateSolution(fixedLambda.Value);
    return;
  }
  CreateSolutionPath();
}
89
/// <summary>
/// Fits a model for exactly one lambda value and adds the training/test NMSE and the
/// resulting regression solution to the results.
/// </summary>
/// <param name="lambda">The lambda value to fit the model for</param>
private void CreateSolution(double lambda) {
  var problemData = Problem.ProblemData;

  double trainError;
  double testError;
  double[] weights = CalculateModelCoefficients(problemData, Penality, lambda, out trainError, out testError);

  Results.Add(new Result("NMSE (train)", new DoubleValue(trainError)));
  Results.Add(new Result("NMSE (test)", new DoubleValue(testError)));

  var symbolicSolution = CreateSymbolicSolution(weights, problemData);
  Results.Add(new Result(symbolicSolution.Name, symbolicSolution.Description, symbolicSolution));
}
[13930]100
/// <summary>
/// Converts a coefficient vector into a symbolic regression solution.
/// </summary>
/// <param name="coeff">
/// Coefficients in the order: one entry per factor variable value, then one entry per
/// double variable, and the intercept as the last element
/// </param>
/// <param name="problemData">Problem data supplying dataset, allowed inputs and target; a clone is stored in the solution</param>
/// <returns>The solution wrapping a tree that contains only the terms with non-zero coefficients</returns>
public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
  var dataset = problemData.Dataset;
  var inputNames = problemData.AllowedInputVariables.ToArray();
  var numericInputs = inputNames.Where(dataset.VariableHasType<double>);
  var factorInputs = inputNames.Where(dataset.VariableHasType<string>);
  // all rows are scanned so that factor values occurring only in the test partition are included as well
  var factorLevels = dataset.GetFactorVariableValues(factorInputs, Enumerable.Range(0, dataset.Rows));

  var keptFactors = new List<KeyValuePair<string, IEnumerable<string>>>();
  var keptFactorWeights = new List<double>();
  var keptNumericInputs = new List<string>();
  var keptNumericWeights = new List<double>();

  // running position within coeff; factor value entries come first, double variables afterwards
  int pos = 0;

  // keep only factor variable / value combinations with a non-zero coefficient
  foreach (var factor in factorLevels) {
    var keptLevels = new List<string>();
    foreach (var level in factor.Value) {
      double weight = coeff[pos++];
      if (weight.IsAlmost(0.0)) continue;
      keptLevels.Add(level);
      keptFactorWeights.Add(weight);
    }
    if (keptLevels.Count > 0) {
      keptFactors.Add(new KeyValuePair<string, IEnumerable<string>>(factor.Key, keptLevels));
    }
  }

  // keep only double variables with a non-zero coefficient
  foreach (var inputName in numericInputs) {
    double weight = coeff[pos++];
    if (weight.IsAlmost(0.0)) continue;
    keptNumericInputs.Add(inputName);
    keptNumericWeights.Add(weight);
  }

  double intercept = coeff.Last();
  var tree = LinearModelToTreeConverter.CreateTree(
    keptFactors, keptFactorWeights.ToArray(),
    keptNumericInputs.ToArray(), keptNumericWeights.ToArray(),
    intercept);

  var model = new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter());
  var solution = new SymbolicRegressionSolution(model, (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Elastic-net Linear Regression Model";
  solution.Name = "Elastic-net Linear Regression Solution";
  return solution;
}
150
/// <summary>
/// Computes the solution path over an automatically determined lambda sequence and adds two
/// tables to the results: the coefficient paths (each coefficient multiplied by the standard
/// deviation of its variable) and the training/test NMSE paths. Both tables additionally show
/// the number of non-zero coefficients per lambda on the second y-axis.
/// </summary>
private void CreateSolutionPath() {
  double[] lambdas;
  double[] trainErrors;
  double[] testErrors;
  double[,] weights;
  double[] intercepts; // required by the call below, not reported in any table
  RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambdas, out trainErrors, out testErrors, out weights, out intercepts);

  int numSteps = lambdas.Length;
  int numWeights = weights.GetLength(1);

  // Both tables share the same fixed x-axis range (powers of ten enclosing the lambda values).
  // The first lambda is skipped for the upper bound.
  // NOTE(review): presumably because the first value of an automatic path is an artificially large number - confirm.
  bool fixXAxisRange = numSteps > 2;
  double xAxisMin = 0.0;
  double xAxisMax = 0.0;
  if (fixXAxisRange) {
    xAxisMin = Math.Pow(10, Math.Floor(Math.Log10(lambdas[numSteps - 1])));
    xAxisMax = Math.Pow(10, Math.Ceiling(Math.Log10(lambdas[1])));
  }

  var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
  var coeffStyle = coeffTable.VisualProperties;
  coeffStyle.YAxisMaximumAuto = false;
  coeffStyle.YAxisMinimumAuto = false;
  coeffStyle.XAxisMaximumAuto = false;
  coeffStyle.XAxisMinimumAuto = false;
  coeffStyle.XAxisLogScale = true;
  coeffStyle.XAxisTitle = "Lambda";
  coeffStyle.YAxisTitle = "Coefficients";
  coeffStyle.SecondYAxisTitle = "Number of variables";

  var dataset = Problem.ProblemData.Dataset;
  var inputNames = Problem.ProblemData.AllowedInputVariables.ToArray();
  var numericInputs = inputNames.Where(dataset.VariableHasType<double>);
  var factorInputs = inputNames.Where(dataset.VariableHasType<string>);
  // all rows are scanned so that factor values occurring only in the test partition are included as well
  var factorLevels = dataset.GetFactorVariableValues(factorInputs, Enumerable.Range(0, dataset.Rows));

  // one row per coefficient column: factor variable values first, double variables afterwards
  var weightPaths = new IndexedDataRow<double>[numWeights];
  int column = 0;
  foreach (var factor in factorLevels) {
    foreach (var level in factor.Value) {
      // standard deviation of the binary indicator for this factor value
      double sigma = dataset.GetStringValues(factor.Key)
        .Select(s => s == level ? 1.0 : 0.0)
        .StandardDeviation();
      var path = new Tuple<double, double>[numSteps];
      for (int step = 0; step < numSteps; step++) {
        path[step] = Tuple.Create(lambdas[step], weights[step, column] * sigma);
      }
      string rowName = factor.Key + "=" + level;
      weightPaths[column] = new IndexedDataRow<double>(rowName, rowName, path);
      column++;
    }
  }
  foreach (var inputName in numericInputs) {
    double sigma = dataset.GetDoubleValues(inputName).StandardDeviation();
    var path = new Tuple<double, double>[numSteps];
    for (int step = 0; step < numSteps; step++) {
      path[step] = Tuple.Create(lambdas[step], weights[step, column] * sigma);
    }
    weightPaths[column] = new IndexedDataRow<double>(inputName, inputName, path);
    column++;
  }

  // rows are added by total absolute weight (larger area under the curve => more important)
  var rowsByImportance = weightPaths.OrderByDescending(row => row.Values.Sum(point => Math.Abs(point.Item2)));
  foreach (var row in rowsByImportance) {
    coeffTable.Rows.Add(row);
  }

  // number of non-zero coefficients for each step of the path
  var nonZeroCounts = new int[numSteps];
  int weightColumns = weights.GetLength(1);
  for (int step = 0; step < weights.GetLength(0); step++) {
    int currentStep = step;
    nonZeroCounts[step] = Enumerable.Range(0, weightColumns).Count(c => !weights[currentStep, c].IsAlmost(0.0));
  }

  if (fixXAxisRange) {
    coeffStyle.XAxisMinimumFixedValue = xAxisMin;
    coeffStyle.XAxisMaximumFixedValue = xAxisMax;
  }
  var coeffCountRow = new IndexedDataRow<double>(
    "Number of variables",
    "The number of non-zero coefficients for each step in the path",
    lambdas.Zip(nonZeroCounts, (l, v) => Tuple.Create(l, (double)v)));
  coeffTable.Rows.Add(coeffCountRow);
  coeffCountRow.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  coeffCountRow.VisualProperties.SecondYAxis = true;

  Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));

  var errorTable = new IndexedDataTable<double>("NMSE", "Path of NMSE values over different lambda values");
  var errorStyle = errorTable.VisualProperties;
  errorStyle.YAxisMaximumAuto = false;
  errorStyle.YAxisMinimumAuto = false;
  errorStyle.XAxisMaximumAuto = false;
  errorStyle.XAxisMinimumAuto = false;

  errorStyle.YAxisMinimumFixedValue = 0;
  errorStyle.YAxisMaximumFixedValue = 1.0;
  errorStyle.XAxisLogScale = true;
  errorStyle.XAxisTitle = "Lambda";
  errorStyle.YAxisTitle = "Normalized mean of squared errors (NMSE)";
  errorStyle.SecondYAxisTitle = "Number of variables";

  var trainRow = new IndexedDataRow<double>(
    "NMSE (train)",
    "Path of NMSE values over different lambda values",
    lambdas.Zip(trainErrors, (l, v) => Tuple.Create(l, v)));
  var testRow = new IndexedDataRow<double>(
    "NMSE (test)",
    "Path of NMSE values over different lambda values",
    lambdas.Zip(testErrors, (l, v) => Tuple.Create(l, v)));
  var errorCountRow = new IndexedDataRow<double>(
    "Number of variables",
    "The number of non-zero coefficients for each step in the path",
    lambdas.Zip(nonZeroCounts, (l, v) => Tuple.Create(l, (double)v)));
  errorTable.Rows.Add(trainRow);
  errorTable.Rows.Add(testRow);
  errorTable.Rows.Add(errorCountRow);

  if (fixXAxisRange) {
    errorStyle.XAxisMinimumFixedValue = xAxisMin;
    errorStyle.XAxisMaximumFixedValue = xAxisMax;
  }
  trainRow.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  testRow.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  errorCountRow.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  errorCountRow.VisualProperties.SecondYAxis = true;

  Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
}
248
/// <summary>
/// Fits an elastic-net model for exactly one lambda value.
/// </summary>
/// <param name="problemData">Problem data to fit the model on</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">The single lambda value to use</param>
/// <param name="trainNMSE">NMSE of the model on the training partition</param>
/// <param name="testNMSE">NMSE of the model on the test partition</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <returns>The coefficient vector of the model with the intercept as last element</returns>
public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda,
  out double trainNMSE, out double testNMSE,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
  double[] trainPath;
  double[] testPath;
  // delegate to the multi-lambda overload with a one-element lambda sequence
  double[][] solutions = CalculateModelCoefficients(problemData, penalty, new[] { lambda }, out trainPath, out testPath, coeffLowerBound, coeffUpperBound);

  trainNMSE = trainPath[0];
  testNMSE = testPath[0];
  return solutions[0];
}
/// <summary>
/// Fits elastic-net models for a user-supplied sequence of lambda values.
/// </summary>
/// <param name="problemData">Problem data to fit the models on</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">The lambda values to use</param>
/// <param name="trainNMSEs">NMSE on the training partition for each returned solution</param>
/// <param name="testNMSEs">NMSE on the test partition for each returned solution</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution (-1 => all variables are allowed)</param>
/// <returns>One coefficient vector per solution, each with the intercept as last element</returns>
public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
  out double[] trainNMSEs, out double[] testNMSEs,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1) {
  double[,] weights;
  double[] intercepts;
  // flmin = 1.0 => the supplied lambda values are used; the returned lambda values overwrite the local parameter only
  RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out weights, out intercepts, coeffLowerBound, coeffUpperBound, maxVars);

  // flatten each matrix row plus its intercept into one jagged-array entry
  int numWeights = weights.GetLength(1);
  var solutions = new double[intercepts.Length][];
  for (int s = 0; s < solutions.Length; s++) {
    var entry = new double[numWeights + 1];
    for (int c = 0; c < numWeights; c++) {
      entry[c] = weights[s, c];
    }
    entry[numWeights] = intercepts[s];
    solutions[s] = entry;
  }
  return solutions;
}
281
/// <summary>
/// Runs elastic-net regression over an automatically determined sequence of at most 100 lambda values.
/// </summary>
/// <param name="problemData">Problem data to fit the models on</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">Output lambda values</param>
/// <param name="trainNMSE">NMSE on the training partition for each solution along the path</param>
/// <param name="testNMSE">NMSE on the test partition for each solution along the path</param>
/// <param name="coeff">Coefficients of each solution along the path</param>
/// <param name="intercept">Intercept of each solution along the path</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
  out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1
  ) {
  const int maxLambdaCount = 100;
  const double automaticFlmin = 0.0; // < 1.0 => lambda values are determined automatically
  var noUserLambdas = new double[0];
  RunElasticNetLinearRegression(problemData, penalty, maxLambdaCount, automaticFlmin, noUserLambdas,
    out lambda, out trainNMSE, out testNMSE, out coeff, out intercept,
    coeffLowerBound, coeffUpperBound, maxVars);
}
291
/// <summary>
/// Elastic net with squared-error-loss for dense predictor matrix. Fits one model for each lambda value
/// (automatically determined or user supplied, see <paramref name="flmin"/>) on the training partition
/// and evaluates each model on the test partition.
/// </summary>
/// <param name="problemData">Predictor target matrix x and target vector y</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="nlam">Maximum number of lambda values (default 100)</param>
/// <param name="flmin">User control of lambda values (&lt;1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values)</param>
/// <param name="ulam">User supplied lambda values</param>
/// <param name="lambda">Output lambda values</param>
/// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
/// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path (NaN where the value could not be calculated)</param>
/// <param name="coeff">Matrix of coefficients with one row for each solution along the path and one column for each predictor variable</param>
/// <param name="intercept">Vector of intercepts for each solution along the path</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="penalty"/> is NaN or outside of [0, 1]</exception>
/// <exception cref="InvalidOperationException">Thrown when glmnet reports a fatal error (positive error flag)</exception>
private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
  int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1
  ) {
  // the check is written in negated form so that NaN is rejected as well (every comparison with NaN is false)
  if (!(penalty >= 0.0 && penalty <= 1.0)) throw new ArgumentException("0 <= penalty <= 1", "penalty");

  double[,] trainX;
  double[,] testX;
  double[] trainY;
  double[] testY;

  // the input matrices are returned transposed: dimension 0 = variables, dimension 1 = observations
  PrepareData(problemData, out trainX, out trainY, out testX, out testY);
  var numTrainObs = trainX.GetLength(1);
  var numTestObs = testX.GetLength(1);
  var numVars = trainX.GetLength(0);

  int ka = 1; // => covariance updating algorithm
  double parm = penalty;
  double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
  int[] jd = new int[1]; // jd[0] == 0 => no predictor variable is excluded (glmnet deletion flag)
  double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // same relative penalty for all predictor variables (0.0 would mean unpenalized)
  double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
  for (int i = 0; i < numVars; i++) {
    cl[i, 0] = coeffLowerBound;
    cl[i, 1] = coeffUpperBound;
  }

  int ne = maxVars > 0 ? maxVars : numVars;
  int nx = numVars;
  double thr = 1.0e-5; // default value as recommended in glmnet
  int isd = 1; //  => regression on standardized predictor variables
  int intr = 1;  // => do include intercept in model
  int maxit = 100000; // default value as recommended in glmnet
  // outputs
  int lmu = -1;
  double[,] ca;
  int[] ia;
  int[] nin;
  int nlp = -99;
  int jerr = -99;
  double[] trainR2;
  Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);

  // glmnet documents jerr > 0 as a fatal error without usable output, negative values are non-fatal
  // (solutions for the preceding lambda values are still returned).
  // NOTE(review): the Glmnet wrapper might already validate jerr; this check only guards the case that it does not.
  if (jerr > 0) throw new InvalidOperationException("glmnet (elnet) failed with error code " + jerr);

  trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
  testNMSE = new double[lmu];
  coeff = new double[lmu, numVars];
  for (int solIdx = 0; solIdx < lmu; solIdx++) {
    trainNMSE[solIdx] = 1.0 - trainR2[solIdx];

    // extract the compressed coefficients of this solution
    int selectedNin = nin[solIdx];
    double[] coefficients;
    double[] selectedCa = new double[nx];
    for (int i = 0; i < nx; i++) {
      selectedCa[i] = ca[solIdx, i];
    }

    // apply to test set to calculate test NMSE values for each lambda step
    double[] fn;
    Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
    OnlineCalculatorError error;
    var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
    if (error != OnlineCalculatorError.None) nmse = double.NaN;
    testNMSE[solIdx] = nmse;

    // uncompress coefficients
    Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
    for (int i = 0; i < coefficients.Length; i++) {
      coeff[solIdx, i] = coefficients[i];
    }
  }
}
381
/// <summary>
/// Extracts the input matrices and target vectors for the training and the test partition.
/// </summary>
/// <param name="problemData">Problem data supplying dataset, allowed inputs, target variable and partitions</param>
/// <param name="trainX">Input matrix for the training rows (as produced by PrepareInputData)</param>
/// <param name="trainY">Target values for the training rows</param>
/// <param name="testX">Input matrix for the test rows (as produced by PrepareInputData)</param>
/// <param name="testY">Target values for the test rows</param>
private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
  out double[,] testX, out double[] testY) {
  var dataset = problemData.Dataset;
  var target = problemData.TargetVariable;
  var inputs = problemData.AllowedInputVariables;

  // training partition
  trainX = PrepareInputData(dataset, inputs, problemData.TrainingIndices);
  trainY = dataset.GetDoubleValues(target, problemData.TrainingIndices).ToArray();

  // test partition
  testX = PrepareInputData(dataset, inputs, problemData.TestIndices);
  testY = dataset.GetDoubleValues(target, problemData.TestIndices).ToArray();
}
[15147]393
/// <summary>
/// Builds the input matrix for the given rows: the columns derived from the factor (string) variables
/// are placed first, followed by the double variables; the combined matrix is returned transposed.
/// </summary>
/// <param name="ds">Dataset to read the values from</param>
/// <param name="allowedInputs">Names of the input variables to include</param>
/// <param name="rows">Row indices to include</param>
/// <returns>The transposed input matrix</returns>
private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) {
  var numericNames = allowedInputs.Where(ds.VariableHasType<double>);
  var factorNames = allowedInputs.Where(ds.VariableHasType<string>);

  // factor values are collected over all rows so that values occurring only in the train or only in the test set are considered
  var allRows = Enumerable.Range(0, ds.Rows);
  var factorLevels = ds.GetFactorVariableValues(factorNames, allRows);

  // NOTE(review): presumably one 0/1 indicator column per factor value - confirm against the ToArray overload
  double[,] factorColumns = ds.ToArray(factorLevels, rows);
  double[,] numericColumns = ds.ToArray(numericNames, rows);

  return factorColumns.HorzCat(numericColumns).Transpose();
}
[13927]403  }
404}
Note: See TracBrowser for help on using the repository browser.