1 | #region License Information
|
---|
2 | /* HeuristicLab
|
---|
3 | * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
4 | *
|
---|
5 | * This file is part of HeuristicLab.
|
---|
6 | *
|
---|
7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
8 | * it under the terms of the GNU General Public License as published by
|
---|
9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
10 | * (at your option) any later version.
|
---|
11 | *
|
---|
12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
15 | * GNU General Public License for more details.
|
---|
16 | *
|
---|
17 | * You should have received a copy of the GNU General Public License
|
---|
18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
19 | */
|
---|
20 | #endregion
|
---|
21 |
|
---|
22 | using System;
|
---|
23 | using System.Linq;
|
---|
24 | using System.Threading;
|
---|
25 | using HeuristicLab.Analysis;
|
---|
26 | using HeuristicLab.Common;
|
---|
27 | using HeuristicLab.Core;
|
---|
28 | using HeuristicLab.Data;
|
---|
29 | using HeuristicLab.Optimization;
|
---|
30 | using HeuristicLab.Parameters;
|
---|
31 | using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
|
---|
32 | using HeuristicLab.Problems.DataAnalysis;
|
---|
33 | using HeuristicLab.Problems.DataAnalysis.Symbolic;
|
---|
34 | using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
|
---|
35 |
|
---|
36 | namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
|
---|
37 | [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
|
---|
38 | [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
|
---|
39 | [StorableClass]
|
---|
40 | public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
|
---|
41 | private const string PenalityParameterName = "Penality";
|
---|
42 | private const string LambdaParameterName = "Lambda";
|
---|
43 | #region parameters
|
---|
44 | public IFixedValueParameter<DoubleValue> PenalityParameter {
|
---|
45 | get { return (IFixedValueParameter<DoubleValue>)Parameters[PenalityParameterName]; }
|
---|
46 | }
|
---|
47 | public IValueParameter<DoubleValue> LambdaParameter {
|
---|
48 | get { return (IValueParameter<DoubleValue>)Parameters[LambdaParameterName]; }
|
---|
49 | }
|
---|
50 | #endregion
|
---|
51 | #region properties
|
---|
52 | public double Penality {
|
---|
53 | get { return PenalityParameter.Value.Value; }
|
---|
54 | set { PenalityParameter.Value.Value = value; }
|
---|
55 | }
|
---|
56 | public DoubleValue Lambda {
|
---|
57 | get { return LambdaParameter.Value; }
|
---|
58 | set { LambdaParameter.Value = value; }
|
---|
59 | }
|
---|
60 | #endregion
|
---|
61 |
|
---|
62 | [StorableConstructor]
|
---|
63 | private ElasticNetLinearRegression(bool deserializing) : base(deserializing) { }
|
---|
64 | private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
|
---|
65 | : base(original, cloner) {
|
---|
66 | }
|
---|
67 | public ElasticNetLinearRegression()
|
---|
68 | : base() {
|
---|
69 | Problem = new RegressionProblem();
|
---|
70 | Parameters.Add(new FixedValueParameter<DoubleValue>(PenalityParameterName, "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5)));
|
---|
71 | Parameters.Add(new OptionalValueParameter<DoubleValue>(LambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas"));
|
---|
72 | }
|
---|
73 |
|
---|
74 | [StorableHook(HookType.AfterDeserialization)]
|
---|
75 | private void AfterDeserialization() { }
|
---|
76 |
|
---|
77 | public override IDeepCloneable Clone(Cloner cloner) {
|
---|
78 | return new ElasticNetLinearRegression(this, cloner);
|
---|
79 | }
|
---|
80 |
|
---|
81 | protected override void Run(CancellationToken cancellationToken) {
|
---|
82 | if (Lambda == null) {
|
---|
83 | CreateSolutionPath();
|
---|
84 | } else {
|
---|
85 | CreateSolution(Lambda.Value);
|
---|
86 | }
|
---|
87 | }
|
---|
88 |
|
---|
89 | private void CreateSolution(double lambda) {
|
---|
90 | double trainNMSE;
|
---|
91 | double testNMSE;
|
---|
92 | var coeff = CreateElasticNetLinearRegressionSolution(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);
|
---|
93 | Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE)));
|
---|
94 | Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE)));
|
---|
95 |
|
---|
96 | var allVariables = Problem.ProblemData.AllowedInputVariables.ToArray();
|
---|
97 |
|
---|
98 | var remainingVars = Enumerable.Range(0, allVariables.Length)
|
---|
99 | .Where(idx => !coeff[idx].IsAlmost(0.0)).Select(idx => allVariables[idx])
|
---|
100 | .ToArray();
|
---|
101 | var remainingCoeff = Enumerable.Range(0, allVariables.Length)
|
---|
102 | .Select(idx => coeff[idx])
|
---|
103 | .Where(c => !c.IsAlmost(0.0))
|
---|
104 | .ToArray();
|
---|
105 |
|
---|
106 | var tree = LinearModelToTreeConverter.CreateTree(remainingVars, remainingCoeff, coeff.Last());
|
---|
107 |
|
---|
108 |
|
---|
109 | SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
|
---|
110 | new SymbolicRegressionModel(Problem.ProblemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
|
---|
111 | (IRegressionProblemData)Problem.ProblemData.Clone());
|
---|
112 | solution.Model.Name = "Elastic-net Linear Regression Model";
|
---|
113 | solution.Name = "Elastic-net Linear Regression Solution";
|
---|
114 |
|
---|
115 | Results.Add(new Result(solution.Name, solution.Description, solution));
|
---|
116 | }
|
---|
117 |
|
---|
118 | private void CreateSolutionPath() {
|
---|
119 | double[] lambda;
|
---|
120 | double[] trainNMSE;
|
---|
121 | double[] testNMSE;
|
---|
122 | double[,] coeff;
|
---|
123 | double[] intercept;
|
---|
124 | RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);
|
---|
125 |
|
---|
126 | var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
|
---|
127 | coeffTable.VisualProperties.YAxisMaximumAuto = false;
|
---|
128 | coeffTable.VisualProperties.YAxisMinimumAuto = false;
|
---|
129 | coeffTable.VisualProperties.XAxisMaximumAuto = false;
|
---|
130 | coeffTable.VisualProperties.XAxisMinimumAuto = false;
|
---|
131 |
|
---|
132 | coeffTable.VisualProperties.XAxisLogScale = true;
|
---|
133 | coeffTable.VisualProperties.XAxisTitle = "Lambda";
|
---|
134 | coeffTable.VisualProperties.YAxisTitle = "Coefficients";
|
---|
135 | coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";
|
---|
136 |
|
---|
137 | var nLambdas = lambda.Length;
|
---|
138 | var nCoeff = coeff.GetLength(1);
|
---|
139 | var dataRows = new IndexedDataRow<double>[nCoeff];
|
---|
140 | var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
|
---|
141 | var numNonZeroCoeffs = new int[nLambdas];
|
---|
142 | for (int i = 0; i < nCoeff; i++) {
|
---|
143 | var coeffId = allowedVars[i];
|
---|
144 | double sigma = Problem.ProblemData.Dataset.GetDoubleValues(coeffId).StandardDeviation();
|
---|
145 | var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
|
---|
146 | dataRows[i] = new IndexedDataRow<double>(coeffId, coeffId, path);
|
---|
147 | }
|
---|
148 | // add to coeffTable by total weight (larger area under the curve => more important);
|
---|
149 | foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) {
|
---|
150 | coeffTable.Rows.Add(r);
|
---|
151 | }
|
---|
152 |
|
---|
153 | for (int i = 0; i < coeff.GetLength(0); i++) {
|
---|
154 | for (int j = 0; j < coeff.GetLength(1); j++) {
|
---|
155 | if (!coeff[i, j].IsAlmost(0.0)) {
|
---|
156 | numNonZeroCoeffs[i]++;
|
---|
157 | }
|
---|
158 | }
|
---|
159 | }
|
---|
160 | if (lambda.Length > 2) {
|
---|
161 | coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
|
---|
162 | coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
|
---|
163 | }
|
---|
164 | coeffTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
|
---|
165 | coeffTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
166 | coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
|
---|
167 |
|
---|
168 | Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));
|
---|
169 |
|
---|
170 | var errorTable = new IndexedDataTable<double>("NMSE", "Path of NMSE values over different lambda values");
|
---|
171 | errorTable.VisualProperties.YAxisMaximumAuto = false;
|
---|
172 | errorTable.VisualProperties.YAxisMinimumAuto = false;
|
---|
173 | errorTable.VisualProperties.XAxisMaximumAuto = false;
|
---|
174 | errorTable.VisualProperties.XAxisMinimumAuto = false;
|
---|
175 |
|
---|
176 | errorTable.VisualProperties.YAxisMinimumFixedValue = 0;
|
---|
177 | errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0;
|
---|
178 | errorTable.VisualProperties.XAxisLogScale = true;
|
---|
179 | errorTable.VisualProperties.XAxisTitle = "Lambda";
|
---|
180 | errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)";
|
---|
181 | errorTable.VisualProperties.SecondYAxisTitle = "Number of variables";
|
---|
182 | errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (train)", "Path of NMSE values over different lambda values", lambda.Zip(trainNMSE, (l, v) => Tuple.Create(l, v))));
|
---|
183 | errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (test)", "Path of NMSE values over different lambda values", lambda.Zip(testNMSE, (l, v) => Tuple.Create(l, v))));
|
---|
184 | errorTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
|
---|
185 | if (lambda.Length > 2) {
|
---|
186 | errorTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
|
---|
187 | errorTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
|
---|
188 | }
|
---|
189 | errorTable.Rows["NMSE (train)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
190 | errorTable.Rows["NMSE (test)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
191 | errorTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
192 | errorTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
|
---|
193 |
|
---|
194 | Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
|
---|
195 | }
|
---|
196 |
|
---|
197 | public static double[] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double lambda,
|
---|
198 | out double trainNMSE, out double testNMSE,
|
---|
199 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
|
---|
200 | double[] trainNMSEs;
|
---|
201 | double[] testNMSEs;
|
---|
202 | // run for exactly one lambda
|
---|
203 | var coeffs = CreateElasticNetLinearRegressionSolution(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
|
---|
204 | trainNMSE = trainNMSEs[0];
|
---|
205 | testNMSE = testNMSEs[0];
|
---|
206 | return coeffs[0];
|
---|
207 | }
|
---|
208 | public static double[][] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double[] lambda,
|
---|
209 | out double[] trainNMSEs, out double[] testNMSEs,
|
---|
210 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
211 | int maxVars = -1) {
|
---|
212 | // run for multiple user-supplied lambdas
|
---|
213 | double[,] coeff;
|
---|
214 | double[] intercept;
|
---|
215 | RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
|
---|
216 |
|
---|
217 | int nRows = intercept.Length;
|
---|
218 | int nCols = coeff.GetLength(1) + 1;
|
---|
219 | double[][] sols = new double[nRows][];
|
---|
220 | for (int solIdx = 0; solIdx < nRows; solIdx++) {
|
---|
221 | sols[solIdx] = new double[nCols];
|
---|
222 | for (int cIdx = 0; cIdx < nCols - 1; cIdx++) {
|
---|
223 | sols[solIdx][cIdx] = coeff[solIdx, cIdx];
|
---|
224 | }
|
---|
225 | sols[solIdx][nCols - 1] = intercept[solIdx];
|
---|
226 | }
|
---|
227 | return sols;
|
---|
228 | }
|
---|
229 |
|
---|
230 | public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
|
---|
231 | out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
|
---|
232 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
233 | int maxVars = -1
|
---|
234 | ) {
|
---|
235 | double[] userLambda = new double[0];
|
---|
236 | // automatically determine lambda values (maximum 100 different lambda values)
|
---|
237 | RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
|
---|
238 | }
|
---|
239 |
|
---|
240 | /// <summary>
|
---|
241 | /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
|
---|
242 | /// </summary>
|
---|
243 | /// <param name="problemData">Predictor target matrix x and target vector y</param>
|
---|
244 | /// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
|
---|
245 | /// <param name="nlam">Maximum number of lambda values (default 100)</param>
|
---|
246 | /// <param name="flmin">User control of lambda values (<1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values</param>
|
---|
247 | /// <param name="ulam">User supplied lambda values</param>
|
---|
248 | /// <param name="lambda">Output lambda values</param>
|
---|
249 | /// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
|
---|
250 | /// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
|
---|
251 | /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
|
---|
252 | /// <param name="intercept">Vector of intercepts for each solution along the path</param>
|
---|
253 | /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
|
---|
254 | /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
|
---|
255 | /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
|
---|
256 | private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
|
---|
257 | int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
|
---|
258 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
259 | int maxVars = -1
|
---|
260 | ) {
|
---|
261 | if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty");
|
---|
262 |
|
---|
263 | double[,] trainX;
|
---|
264 | double[,] testX;
|
---|
265 | double[] trainY;
|
---|
266 | double[] testY;
|
---|
267 |
|
---|
268 | PrepareData(problemData, out trainX, out trainY, out testX, out testY);
|
---|
269 | var numTrainObs = trainX.GetLength(1);
|
---|
270 | var numTestObs = testX.GetLength(1);
|
---|
271 | var numVars = trainX.GetLength(0);
|
---|
272 |
|
---|
273 | int ka = 1; // => covariance updating algorithm
|
---|
274 | double parm = penalty;
|
---|
275 | double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
|
---|
276 | int[] jd = new int[1]; // do not force to use any of the variables
|
---|
277 | double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // all predictor variables are unpenalized
|
---|
278 | double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
|
---|
279 | for (int i = 0; i < numVars; i++) {
|
---|
280 | cl[i, 0] = coeffLowerBound;
|
---|
281 | cl[i, 1] = coeffUpperBound;
|
---|
282 | }
|
---|
283 |
|
---|
284 | int ne = maxVars > 0 ? maxVars : numVars;
|
---|
285 | int nx = numVars;
|
---|
286 | double thr = 1.0e-5; // default value as recommended in glmnet
|
---|
287 | int isd = 1; // => regression on standardized predictor variables
|
---|
288 | int intr = 1; // => do include intercept in model
|
---|
289 | int maxit = 100000; // default value as recommended in glmnet
|
---|
290 | // outputs
|
---|
291 | int lmu = -1;
|
---|
292 | double[,] ca;
|
---|
293 | int[] ia;
|
---|
294 | int[] nin;
|
---|
295 | int nlp = -99;
|
---|
296 | int jerr = -99;
|
---|
297 | double[] trainR2;
|
---|
298 | Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);
|
---|
299 |
|
---|
300 | trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
|
---|
301 | testNMSE = new double[lmu];
|
---|
302 | coeff = new double[lmu, numVars];
|
---|
303 | for (int solIdx = 0; solIdx < lmu; solIdx++) {
|
---|
304 | trainNMSE[solIdx] = 1.0 - trainR2[solIdx];
|
---|
305 |
|
---|
306 | // uncompress coefficients of solution
|
---|
307 | int selectedNin = nin[solIdx];
|
---|
308 | double[] coefficients;
|
---|
309 | double[] selectedCa = new double[nx];
|
---|
310 | for (int i = 0; i < nx; i++) {
|
---|
311 | selectedCa[i] = ca[solIdx, i];
|
---|
312 | }
|
---|
313 |
|
---|
314 | // apply to test set to calculate test NMSE values for each lambda step
|
---|
315 | double[] fn;
|
---|
316 | Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
|
---|
317 | OnlineCalculatorError error;
|
---|
318 | var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
|
---|
319 | if (error != OnlineCalculatorError.None) nmse = double.NaN;
|
---|
320 | testNMSE[solIdx] = nmse;
|
---|
321 |
|
---|
322 | // uncompress coefficients
|
---|
323 | Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
|
---|
324 | for (int i = 0; i < coefficients.Length; i++) {
|
---|
325 | coeff[solIdx, i] = coefficients[i];
|
---|
326 | }
|
---|
327 | }
|
---|
328 | }
|
---|
329 |
|
---|
330 | private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
|
---|
331 | out double[,] testX, out double[] testY) {
|
---|
332 |
|
---|
333 | var ds = problemData.Dataset;
|
---|
334 | trainX = ds.ToArray(problemData.AllowedInputVariables, problemData.TrainingIndices);
|
---|
335 | trainX = trainX.Transpose();
|
---|
336 | trainY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable,
|
---|
337 | problemData.TrainingIndices)
|
---|
338 | .ToArray();
|
---|
339 | testX = ds.ToArray(problemData.AllowedInputVariables, problemData.TestIndices);
|
---|
340 | testX = testX.Transpose();
|
---|
341 | testY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable,
|
---|
342 | problemData.TestIndices)
|
---|
343 | .ToArray();
|
---|
344 | }
|
---|
345 | }
|
---|
346 | }
|
---|