1 | #region License Information
|
---|
2 | /* HeuristicLab
|
---|
3 | * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
4 | *
|
---|
5 | * This file is part of HeuristicLab.
|
---|
6 | *
|
---|
7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
8 | * it under the terms of the GNU General Public License as published by
|
---|
9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
10 | * (at your option) any later version.
|
---|
11 | *
|
---|
12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
15 | * GNU General Public License for more details.
|
---|
16 | *
|
---|
17 | * You should have received a copy of the GNU General Public License
|
---|
18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
19 | */
|
---|
20 | #endregion
|
---|
21 |
|
---|
22 | using System;
|
---|
23 | using System.Collections.Generic;
|
---|
24 | using System.Linq;
|
---|
25 | using System.Threading;
|
---|
26 | using HeuristicLab.Analysis;
|
---|
27 | using HeuristicLab.Common;
|
---|
28 | using HeuristicLab.Core;
|
---|
29 | using HeuristicLab.Data;
|
---|
30 | using HeuristicLab.Optimization;
|
---|
31 | using HeuristicLab.Parameters;
|
---|
32 | using HEAL.Attic;
|
---|
33 | using HeuristicLab.Problems.DataAnalysis;
|
---|
34 | using HeuristicLab.Problems.DataAnalysis.Symbolic;
|
---|
35 | using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
|
---|
36 |
|
---|
37 | namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
|
---|
38 | [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
|
---|
39 | [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
|
---|
40 | [StorableType("529EDD40-91F3-4F3E-929F-852A3EF9B02B")]
|
---|
41 | public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
|
---|
42 | private const string PenalityParameterName = "Penality";
|
---|
43 | private const string LambdaParameterName = "Lambda";
|
---|
44 | #region parameters
|
---|
45 | public IFixedValueParameter<DoubleValue> PenalityParameter {
|
---|
46 | get { return (IFixedValueParameter<DoubleValue>)Parameters[PenalityParameterName]; }
|
---|
47 | }
|
---|
48 | public IValueParameter<DoubleValue> LambdaParameter {
|
---|
49 | get { return (IValueParameter<DoubleValue>)Parameters[LambdaParameterName]; }
|
---|
50 | }
|
---|
51 | #endregion
|
---|
52 | #region properties
|
---|
53 | public double Penality {
|
---|
54 | get { return PenalityParameter.Value.Value; }
|
---|
55 | set { PenalityParameter.Value.Value = value; }
|
---|
56 | }
|
---|
57 | public DoubleValue Lambda {
|
---|
58 | get { return LambdaParameter.Value; }
|
---|
59 | set { LambdaParameter.Value = value; }
|
---|
60 | }
|
---|
61 | #endregion
|
---|
62 |
|
---|
63 | [StorableConstructor]
|
---|
64 | private ElasticNetLinearRegression(StorableConstructorFlag _) : base(_) { }
|
---|
65 | private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
|
---|
66 | : base(original, cloner) {
|
---|
67 | }
|
---|
68 | public ElasticNetLinearRegression()
|
---|
69 | : base() {
|
---|
70 | Problem = new RegressionProblem();
|
---|
71 | Parameters.Add(new FixedValueParameter<DoubleValue>(PenalityParameterName, "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5)));
|
---|
72 | Parameters.Add(new OptionalValueParameter<DoubleValue>(LambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas"));
|
---|
73 | }
|
---|
74 |
|
---|
75 | [StorableHook(HookType.AfterDeserialization)]
|
---|
76 | private void AfterDeserialization() { }
|
---|
77 |
|
---|
78 | public override IDeepCloneable Clone(Cloner cloner) {
|
---|
79 | return new ElasticNetLinearRegression(this, cloner);
|
---|
80 | }
|
---|
81 |
|
---|
82 | protected override void Run(CancellationToken cancellationToken) {
|
---|
83 | if (Lambda == null) {
|
---|
84 | CreateSolutionPath();
|
---|
85 | } else {
|
---|
86 | CreateSolution(Lambda.Value);
|
---|
87 | }
|
---|
88 | }
|
---|
89 |
|
---|
90 | private void CreateSolution(double lambda) {
|
---|
91 | double trainNMSE;
|
---|
92 | double testNMSE;
|
---|
93 | var coeff = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);
|
---|
94 | Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE)));
|
---|
95 | Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE)));
|
---|
96 |
|
---|
97 | var solution = CreateSymbolicSolution(coeff, Problem.ProblemData);
|
---|
98 | Results.Add(new Result(solution.Name, solution.Description, solution));
|
---|
99 | }
|
---|
100 |
|
---|
101 | public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
|
---|
102 | var ds = problemData.Dataset;
|
---|
103 | var allVariables = problemData.AllowedInputVariables.ToArray();
|
---|
104 | var doubleVariables = allVariables.Where(ds.VariableHasType<double>);
|
---|
105 | var factorVariableNames = allVariables.Where(ds.VariableHasType<string>);
|
---|
106 | var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
|
---|
107 |
|
---|
108 | List<KeyValuePair<string, IEnumerable<string>>> remainingFactorVariablesAndValues = new List<KeyValuePair<string, IEnumerable<string>>>();
|
---|
109 | List<double> factorCoeff = new List<double>();
|
---|
110 | List<string> remainingDoubleVariables = new List<string>();
|
---|
111 | List<double> doubleVarCoeff = new List<double>();
|
---|
112 |
|
---|
113 | {
|
---|
114 | int i = 0;
|
---|
115 | // find factor varibles & value combinations with non-zero coeff
|
---|
116 | foreach (var factorVarAndValues in factorVariablesAndValues) {
|
---|
117 | var l = new List<string>();
|
---|
118 | foreach (var factorValue in factorVarAndValues.Value) {
|
---|
119 | if (!coeff[i].IsAlmost(0.0)) {
|
---|
120 | l.Add(factorValue);
|
---|
121 | factorCoeff.Add(coeff[i]);
|
---|
122 | }
|
---|
123 | i++;
|
---|
124 | }
|
---|
125 | if (l.Any()) remainingFactorVariablesAndValues.Add(new KeyValuePair<string, IEnumerable<string>>(factorVarAndValues.Key, l));
|
---|
126 | }
|
---|
127 | // find double variables with non-zero coeff
|
---|
128 | foreach (var doubleVar in doubleVariables) {
|
---|
129 | if (!coeff[i].IsAlmost(0.0)) {
|
---|
130 | remainingDoubleVariables.Add(doubleVar);
|
---|
131 | doubleVarCoeff.Add(coeff[i]);
|
---|
132 | }
|
---|
133 | i++;
|
---|
134 | }
|
---|
135 | }
|
---|
136 | var tree = LinearModelToTreeConverter.CreateTree(
|
---|
137 | remainingFactorVariablesAndValues, factorCoeff.ToArray(),
|
---|
138 | remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(),
|
---|
139 | coeff.Last());
|
---|
140 |
|
---|
141 |
|
---|
142 | SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
|
---|
143 | new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
|
---|
144 | (IRegressionProblemData)problemData.Clone());
|
---|
145 | solution.Model.Name = "Elastic-net Linear Regression Model";
|
---|
146 | solution.Name = "Elastic-net Linear Regression Solution";
|
---|
147 |
|
---|
148 | return solution;
|
---|
149 | }
|
---|
150 |
|
---|
151 | private void CreateSolutionPath() {
|
---|
152 | double[] lambda;
|
---|
153 | double[] trainNMSE;
|
---|
154 | double[] testNMSE;
|
---|
155 | double[,] coeff;
|
---|
156 | double[] intercept;
|
---|
157 | RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);
|
---|
158 |
|
---|
159 | var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
|
---|
160 | coeffTable.VisualProperties.YAxisMaximumAuto = false;
|
---|
161 | coeffTable.VisualProperties.YAxisMinimumAuto = false;
|
---|
162 | coeffTable.VisualProperties.XAxisMaximumAuto = false;
|
---|
163 | coeffTable.VisualProperties.XAxisMinimumAuto = false;
|
---|
164 |
|
---|
165 | coeffTable.VisualProperties.XAxisLogScale = true;
|
---|
166 | coeffTable.VisualProperties.XAxisTitle = "Lambda";
|
---|
167 | coeffTable.VisualProperties.YAxisTitle = "Coefficients";
|
---|
168 | coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";
|
---|
169 |
|
---|
170 | var nLambdas = lambda.Length;
|
---|
171 | var nCoeff = coeff.GetLength(1);
|
---|
172 | var dataRows = new IndexedDataRow<double>[nCoeff];
|
---|
173 | var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
|
---|
174 | var numNonZeroCoeffs = new int[nLambdas];
|
---|
175 |
|
---|
176 | var ds = Problem.ProblemData.Dataset;
|
---|
177 | var doubleVariables = allowedVars.Where(ds.VariableHasType<double>);
|
---|
178 | var factorVariableNames = allowedVars.Where(ds.VariableHasType<string>);
|
---|
179 | var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
|
---|
180 | {
|
---|
181 | int i = 0;
|
---|
182 | foreach (var factorVariableAndValues in factorVariablesAndValues) {
|
---|
183 | foreach (var factorValue in factorVariableAndValues.Value) {
|
---|
184 | double sigma = ds.GetStringValues(factorVariableAndValues.Key)
|
---|
185 | .Select(s => s == factorValue ? 1.0 : 0.0)
|
---|
186 | .StandardDeviation(); // calc std dev of binary indicator
|
---|
187 | var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
|
---|
188 | dataRows[i] = new IndexedDataRow<double>(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path);
|
---|
189 | i++;
|
---|
190 | }
|
---|
191 | }
|
---|
192 |
|
---|
193 | foreach (var doubleVariable in doubleVariables) {
|
---|
194 | double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
|
---|
195 | var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
|
---|
196 | dataRows[i] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path);
|
---|
197 | i++;
|
---|
198 | }
|
---|
199 | // add to coeffTable by total weight (larger area under the curve => more important);
|
---|
200 | foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) {
|
---|
201 | coeffTable.Rows.Add(r);
|
---|
202 | }
|
---|
203 | }
|
---|
204 |
|
---|
205 | for (int i = 0; i < coeff.GetLength(0); i++) {
|
---|
206 | for (int j = 0; j < coeff.GetLength(1); j++) {
|
---|
207 | if (!coeff[i, j].IsAlmost(0.0)) {
|
---|
208 | numNonZeroCoeffs[i]++;
|
---|
209 | }
|
---|
210 | }
|
---|
211 | }
|
---|
212 | if (lambda.Length > 2) {
|
---|
213 | coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
|
---|
214 | coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
|
---|
215 | }
|
---|
216 | coeffTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
|
---|
217 | coeffTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
218 | coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
|
---|
219 |
|
---|
220 | Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));
|
---|
221 |
|
---|
222 | var errorTable = new IndexedDataTable<double>("NMSE", "Path of NMSE values over different lambda values");
|
---|
223 | errorTable.VisualProperties.YAxisMaximumAuto = false;
|
---|
224 | errorTable.VisualProperties.YAxisMinimumAuto = false;
|
---|
225 | errorTable.VisualProperties.XAxisMaximumAuto = false;
|
---|
226 | errorTable.VisualProperties.XAxisMinimumAuto = false;
|
---|
227 |
|
---|
228 | errorTable.VisualProperties.YAxisMinimumFixedValue = 0;
|
---|
229 | errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0;
|
---|
230 | errorTable.VisualProperties.XAxisLogScale = true;
|
---|
231 | errorTable.VisualProperties.XAxisTitle = "Lambda";
|
---|
232 | errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)";
|
---|
233 | errorTable.VisualProperties.SecondYAxisTitle = "Number of variables";
|
---|
234 | errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (train)", "Path of NMSE values over different lambda values", lambda.Zip(trainNMSE, (l, v) => Tuple.Create(l, v))));
|
---|
235 | errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (test)", "Path of NMSE values over different lambda values", lambda.Zip(testNMSE, (l, v) => Tuple.Create(l, v))));
|
---|
236 | errorTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
|
---|
237 | if (lambda.Length > 2) {
|
---|
238 | errorTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
|
---|
239 | errorTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
|
---|
240 | }
|
---|
241 | errorTable.Rows["NMSE (train)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
242 | errorTable.Rows["NMSE (test)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
243 | errorTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
|
---|
244 | errorTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;
|
---|
245 |
|
---|
246 | Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
|
---|
247 | }
|
---|
248 |
|
---|
249 | public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda,
|
---|
250 | out double trainNMSE, out double testNMSE,
|
---|
251 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
|
---|
252 | double[] trainNMSEs;
|
---|
253 | double[] testNMSEs;
|
---|
254 | // run for exactly one lambda
|
---|
255 | var coeffs = CalculateModelCoefficients(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
|
---|
256 | trainNMSE = trainNMSEs[0];
|
---|
257 | testNMSE = testNMSEs[0];
|
---|
258 | return coeffs[0];
|
---|
259 | }
|
---|
260 | public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
|
---|
261 | out double[] trainNMSEs, out double[] testNMSEs,
|
---|
262 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
263 | int maxVars = -1) {
|
---|
264 | // run for multiple user-supplied lambdas
|
---|
265 | double[,] coeff;
|
---|
266 | double[] intercept;
|
---|
267 | RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
|
---|
268 |
|
---|
269 | int nRows = intercept.Length;
|
---|
270 | int nCols = coeff.GetLength(1) + 1;
|
---|
271 | double[][] sols = new double[nRows][];
|
---|
272 | for (int solIdx = 0; solIdx < nRows; solIdx++) {
|
---|
273 | sols[solIdx] = new double[nCols];
|
---|
274 | for (int cIdx = 0; cIdx < nCols - 1; cIdx++) {
|
---|
275 | sols[solIdx][cIdx] = coeff[solIdx, cIdx];
|
---|
276 | }
|
---|
277 | sols[solIdx][nCols - 1] = intercept[solIdx];
|
---|
278 | }
|
---|
279 | return sols;
|
---|
280 | }
|
---|
281 |
|
---|
282 | public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
|
---|
283 | out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
|
---|
284 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
285 | int maxVars = -1
|
---|
286 | ) {
|
---|
287 | double[] userLambda = new double[0];
|
---|
288 | // automatically determine lambda values (maximum 100 different lambda values)
|
---|
289 | RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
|
---|
290 | }
|
---|
291 |
|
---|
292 | /// <summary>
|
---|
293 | /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
|
---|
294 | /// </summary>
|
---|
295 | /// <param name="problemData">Predictor target matrix x and target vector y</param>
|
---|
296 | /// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
|
---|
297 | /// <param name="nlam">Maximum number of lambda values (default 100)</param>
|
---|
298 | /// <param name="flmin">User control of lambda values (<1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values</param>
|
---|
299 | /// <param name="ulam">User supplied lambda values</param>
|
---|
300 | /// <param name="lambda">Output lambda values</param>
|
---|
301 | /// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
|
---|
302 | /// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
|
---|
303 | /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
|
---|
304 | /// <param name="intercept">Vector of intercepts for each solution along the path</param>
|
---|
305 | /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
|
---|
306 | /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
|
---|
307 | /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
|
---|
308 | private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
|
---|
309 | int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
|
---|
310 | double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
|
---|
311 | int maxVars = -1
|
---|
312 | ) {
|
---|
313 | if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty");
|
---|
314 |
|
---|
315 | double[,] trainX;
|
---|
316 | double[,] testX;
|
---|
317 | double[] trainY;
|
---|
318 | double[] testY;
|
---|
319 |
|
---|
320 | PrepareData(problemData, out trainX, out trainY, out testX, out testY);
|
---|
321 | var numTrainObs = trainX.GetLength(1);
|
---|
322 | var numTestObs = testX.GetLength(1);
|
---|
323 | var numVars = trainX.GetLength(0);
|
---|
324 |
|
---|
325 | int ka = 1; // => covariance updating algorithm
|
---|
326 | double parm = penalty;
|
---|
327 | double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
|
---|
328 | int[] jd = new int[1]; // do not force to use any of the variables
|
---|
329 | double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // all predictor variables are unpenalized
|
---|
330 | double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
|
---|
331 | for (int i = 0; i < numVars; i++) {
|
---|
332 | cl[i, 0] = coeffLowerBound;
|
---|
333 | cl[i, 1] = coeffUpperBound;
|
---|
334 | }
|
---|
335 |
|
---|
336 | int ne = maxVars > 0 ? maxVars : numVars;
|
---|
337 | int nx = numVars;
|
---|
338 | double thr = 1.0e-5; // default value as recommended in glmnet
|
---|
339 | int isd = 1; // => regression on standardized predictor variables
|
---|
340 | int intr = 1; // => do include intercept in model
|
---|
341 | int maxit = 100000; // default value as recommended in glmnet
|
---|
342 | // outputs
|
---|
343 | int lmu = -1;
|
---|
344 | double[,] ca;
|
---|
345 | int[] ia;
|
---|
346 | int[] nin;
|
---|
347 | int nlp = -99;
|
---|
348 | int jerr = -99;
|
---|
349 | double[] trainR2;
|
---|
350 | Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);
|
---|
351 |
|
---|
352 | trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
|
---|
353 | testNMSE = new double[lmu];
|
---|
354 | coeff = new double[lmu, numVars];
|
---|
355 | for (int solIdx = 0; solIdx < lmu; solIdx++) {
|
---|
356 | trainNMSE[solIdx] = 1.0 - trainR2[solIdx];
|
---|
357 |
|
---|
358 | // uncompress coefficients of solution
|
---|
359 | int selectedNin = nin[solIdx];
|
---|
360 | double[] coefficients;
|
---|
361 | double[] selectedCa = new double[nx];
|
---|
362 | for (int i = 0; i < nx; i++) {
|
---|
363 | selectedCa[i] = ca[solIdx, i];
|
---|
364 | }
|
---|
365 |
|
---|
366 | // apply to test set to calculate test NMSE values for each lambda step
|
---|
367 | double[] fn;
|
---|
368 | Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
|
---|
369 | OnlineCalculatorError error;
|
---|
370 | var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
|
---|
371 | if (error != OnlineCalculatorError.None) nmse = double.NaN;
|
---|
372 | testNMSE[solIdx] = nmse;
|
---|
373 |
|
---|
374 | // uncompress coefficients
|
---|
375 | Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
|
---|
376 | for (int i = 0; i < coefficients.Length; i++) {
|
---|
377 | coeff[solIdx, i] = coefficients[i];
|
---|
378 | }
|
---|
379 | }
|
---|
380 | }
|
---|
381 |
|
---|
382 | private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
|
---|
383 | out double[,] testX, out double[] testY) {
|
---|
384 | var ds = problemData.Dataset;
|
---|
385 | var targetVariable = problemData.TargetVariable;
|
---|
386 | var allowedInputs = problemData.AllowedInputVariables;
|
---|
387 | trainX = PrepareInputData(ds, allowedInputs, problemData.TrainingIndices);
|
---|
388 | trainY = ds.GetDoubleValues(targetVariable, problemData.TrainingIndices).ToArray();
|
---|
389 |
|
---|
390 | testX = PrepareInputData(ds, allowedInputs, problemData.TestIndices);
|
---|
391 | testY = ds.GetDoubleValues(targetVariable, problemData.TestIndices).ToArray();
|
---|
392 | }
|
---|
393 |
|
---|
394 | private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) {
|
---|
395 | var doubleVariables = allowedInputs.Where(ds.VariableHasType<double>);
|
---|
396 | var factorVariableNames = allowedInputs.Where(ds.VariableHasType<string>);
|
---|
397 | var factorVariables = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
|
---|
398 | double[,] binaryMatrix = ds.ToArray(factorVariables, rows);
|
---|
399 | double[,] doubleVarMatrix = ds.ToArray(doubleVariables, rows);
|
---|
400 | var x = binaryMatrix.HorzCat(doubleVarMatrix);
|
---|
401 | return x.Transpose();
|
---|
402 | }
|
---|
403 | }
|
---|
404 | }
|
---|