[14370] | 1 | #region License Information
|
---|
| 2 | /* HeuristicLab
|
---|
[16057] | 3 | * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
[14370] | 4 | *
|
---|
| 5 | * This file is part of HeuristicLab.
|
---|
| 6 | *
|
---|
| 7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
| 19 | */
|
---|
| 20 | #endregion
|
---|
| 21 |
|
---|
| 22 | using System;
|
---|
[15023] | 23 | using System.Collections.Generic;
|
---|
[13927] | 24 | using System.Linq;
|
---|
[14674] | 25 | using System.Threading;
|
---|
[13927] | 26 | using HeuristicLab.Analysis;
|
---|
| 27 | using HeuristicLab.Common;
|
---|
| 28 | using HeuristicLab.Core;
|
---|
| 29 | using HeuristicLab.Data;
|
---|
| 30 | using HeuristicLab.Optimization;
|
---|
| 31 | using HeuristicLab.Parameters;
|
---|
| 32 | using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
|
---|
| 33 | using HeuristicLab.Problems.DataAnalysis;
|
---|
[13930] | 34 | using HeuristicLab.Problems.DataAnalysis.Symbolic;
|
---|
| 35 | using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
|
---|
[13927] | 36 |
|
---|
[14225] | 37 | namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
|
---|
[13927] | 38 | [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
|
---|
| 39 | [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
|
---|
| 40 | [StorableClass]
|
---|
| 41 | public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
|
---|
// Keys under which the two algorithm parameters are stored in the Parameters collection.
// The spelling "Penality" is the registered parameter name and is kept as is.
private const string PenalityParameterName = "Penality";
private const string LambdaParameterName = "Lambda";

#region parameters
/// <summary>
/// The parameter stored under the "Penality" key.
/// </summary>
public IFixedValueParameter<DoubleValue> PenalityParameter {
  get {
    var parameter = Parameters[PenalityParameterName];
    return (IFixedValueParameter<DoubleValue>)parameter;
  }
}

/// <summary>
/// The parameter stored under the "Lambda" key.
/// </summary>
public IValueParameter<DoubleValue> LambdaParameter {
  get {
    var parameter = Parameters[LambdaParameterName];
    return (IValueParameter<DoubleValue>)parameter;
  }
}
#endregion

#region properties
/// <summary>
/// Convenience access to the numeric value held by <see cref="PenalityParameter"/>.
/// </summary>
public double Penality {
  get {
    return PenalityParameter.Value.Value;
  }
  set {
    PenalityParameter.Value.Value = value;
  }
}

/// <summary>
/// Convenience access to the value object held by <see cref="LambdaParameter"/>.
/// Run treats a null value as the request to calculate the whole path of lambdas.
/// </summary>
public DoubleValue Lambda {
  get {
    return LambdaParameter.Value;
  }
  set {
    LambdaParameter.Value = value;
  }
}
#endregion
| 62 |
|
---|
/// <summary>
/// Constructor marked for the persistence framework; only forwards to the base class.
/// </summary>
[StorableConstructor]
private ElasticNetLinearRegression(bool deserializing)
  : base(deserializing) {
}

/// <summary>
/// Copy constructor used by <see cref="Clone(Cloner)"/>; only forwards to the base class.
/// </summary>
private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
  : base(original, cloner) {
}

/// <summary>
/// Creates the algorithm with a new regression problem, a penalty of 0.5 and no lambda value.
/// </summary>
public ElasticNetLinearRegression()
  : base() {
  Problem = new RegressionProblem();

  var penalityParameter = new FixedValueParameter<DoubleValue>(
    PenalityParameterName,
    "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression",
    new DoubleValue(0.5));
  Parameters.Add(penalityParameter);

  // no value is supplied here, so Lambda is null until the user sets one
  var lambdaParameter = new OptionalValueParameter<DoubleValue>(
    LambdaParameterName,
    "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas");
  Parameters.Add(lambdaParameter);
}

/// <summary>
/// Hook invoked after deserialization; intentionally empty.
/// </summary>
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
}

/// <summary>
/// Creates a copy of this algorithm through the copy constructor.
/// </summary>
public override IDeepCloneable Clone(Cloner cloner) {
  var copy = new ElasticNetLinearRegression(this, cloner);
  return copy;
}
| 81 |
|
---|
/// <summary>
/// Entry point of the algorithm. With a lambda value set, a single solution is calculated for
/// that lambda; otherwise the whole path of lambdas is calculated.
/// The cancellation token is not used by this implementation.
/// </summary>
protected override void Run(CancellationToken cancellationToken) {
  if (Lambda != null) {
    CreateSolution(Lambda.Value);
  } else {
    CreateSolutionPath();
  }
}
| 89 |
|
---|
/// <summary>
/// Fits the model for one lambda value and adds the training NMSE, the test NMSE and the
/// resulting symbolic regression solution to the results.
/// </summary>
/// <param name="lambda">The lambda value for which the model is calculated</param>
private void CreateSolution(double lambda) {
  double nmseTrain;
  double nmseTest;
  double[] coefficients = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out nmseTrain, out nmseTest);

  var trainResult = new Result("NMSE (train)", new DoubleValue(nmseTrain));
  Results.Add(trainResult);
  var testResult = new Result("NMSE (test)", new DoubleValue(nmseTest));
  Results.Add(testResult);

  var symbolicSolution = CreateSymbolicSolution(coefficients, Problem.ProblemData);
  var solutionResult = new Result(symbolicSolution.Name, symbolicSolution.Description, symbolicSolution);
  Results.Add(solutionResult);
}
| 100 |
|
---|
/// <summary>
/// Builds a symbolic regression solution from a coefficient vector.
/// </summary>
/// <param name="coeff">
/// Coefficients in the order: one entry per factor value (factor variable by factor variable),
/// then one entry per double variable, and the intercept as the last element
/// </param>
/// <param name="problemData">Problem data providing the dataset, the allowed inputs and the target variable</param>
/// <returns>A solution whose tree only contains the terms with a coefficient that is not almost zero</returns>
public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
  var dataset = problemData.Dataset;
  var inputNames = problemData.AllowedInputVariables.ToArray();
  var numericInputs = inputNames.Where(dataset.VariableHasType<double>);
  var factorInputs = inputNames.Where(dataset.VariableHasType<string>);
  // factor values are collected over all rows so that values occurring only in the test set are included
  var factorLevels = dataset.GetFactorVariableValues(factorInputs, Enumerable.Range(0, dataset.Rows));

  var keptFactors = new List<KeyValuePair<string, IEnumerable<string>>>();
  var keptFactorWeights = new List<double>();
  var keptNumericInputs = new List<string>();
  var keptNumericWeights = new List<double>();

  // position of the next coefficient to consume; shared by both loops
  int pos = 0;

  // keep factor variable / value combinations with a non-zero coefficient
  foreach (var factor in factorLevels) {
    var keptLevels = new List<string>();
    foreach (var level in factor.Value) {
      double weight = coeff[pos];
      pos++;
      if (weight.IsAlmost(0.0)) continue;
      keptLevels.Add(level);
      keptFactorWeights.Add(weight);
    }
    if (keptLevels.Count > 0) {
      keptFactors.Add(new KeyValuePair<string, IEnumerable<string>>(factor.Key, keptLevels));
    }
  }

  // keep double variables with a non-zero coefficient
  foreach (var numericInput in numericInputs) {
    double weight = coeff[pos];
    pos++;
    if (weight.IsAlmost(0.0)) continue;
    keptNumericInputs.Add(numericInput);
    keptNumericWeights.Add(weight);
  }

  // the last element of the coefficient vector is passed as the constant term
  var tree = LinearModelToTreeConverter.CreateTree(
    keptFactors, keptFactorWeights.ToArray(),
    keptNumericInputs.ToArray(), keptNumericWeights.ToArray(),
    coeff.Last());

  var model = new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter());
  var solution = new SymbolicRegressionSolution(model, (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Elastic-net Linear Regression Model";
  solution.Name = "Elastic-net Linear Regression Solution";
  return solution;
}
| 150 |
|
---|
/// <summary>
/// Calculates the solutions for the whole path of lambdas and adds two result tables:
/// "Coefficients" (scaled coefficient values per lambda) and "NMSE" (training and test error
/// per lambda). Both tables also show the number of non-zero coefficients on the second y-axis.
/// </summary>
private void CreateSolutionPath() {
  var problemData = Problem.ProblemData;
  double[] lambda;
  double[] trainNMSE;
  double[] testNMSE;
  double[,] coeff;
  double[] intercept;
  RunElasticNetLinearRegression(problemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);

  var numNonZeroCoeffs = CountNonZeroCoefficients(coeff, lambda.Length);
  const string numVarsRowDescription = "The number of non-zero coefficients for each step in the path";

  // table of coefficient paths
  var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standardized coefficient values over different lambda values");
  ConfigureLambdaAxis(coeffTable, lambda);
  coeffTable.VisualProperties.YAxisTitle = "Coefficients";
  coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";
  foreach (var row in CreateCoefficientPathRows(problemData, lambda, coeff)) {
    coeffTable.Rows.Add(row);
  }
  var coeffNumVarsRow = CreatePointsRow("Number of variables", numVarsRowDescription, lambda, numNonZeroCoeffs.Select(c => (double)c));
  coeffNumVarsRow.VisualProperties.SecondYAxis = true;
  coeffTable.Rows.Add(coeffNumVarsRow);
  Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));

  // table of error paths
  const string nmseRowDescription = "Path of NMSE values over different lambda values";
  var errorTable = new IndexedDataTable<double>("NMSE", nmseRowDescription);
  ConfigureLambdaAxis(errorTable, lambda);
  errorTable.VisualProperties.YAxisMinimumFixedValue = 0;
  errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0;
  errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)";
  errorTable.VisualProperties.SecondYAxisTitle = "Number of variables";
  errorTable.Rows.Add(CreatePointsRow("NMSE (train)", nmseRowDescription, lambda, trainNMSE));
  errorTable.Rows.Add(CreatePointsRow("NMSE (test)", nmseRowDescription, lambda, testNMSE));
  var errorNumVarsRow = CreatePointsRow("Number of variables", numVarsRowDescription, lambda, numNonZeroCoeffs.Select(c => (double)c));
  errorNumVarsRow.VisualProperties.SecondYAxis = true;
  errorTable.Rows.Add(errorNumVarsRow);
  Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
}

// Counts, for each row of the coefficient matrix (one row per solution), the coefficients that are not almost zero.
// The result has one slot per lambda value.
private static int[] CountNonZeroCoefficients(double[,] coeff, int nLambdas) {
  var counts = new int[nLambdas];
  for (int i = 0; i < coeff.GetLength(0); i++) {
    for (int j = 0; j < coeff.GetLength(1); j++) {
      if (!coeff[i, j].IsAlmost(0.0)) {
        counts[i]++;
      }
    }
  }
  return counts;
}

// Applies the axis settings shared by both path tables: automatic axis limits off, logarithmic x-axis titled "Lambda",
// and (for more than two lambda values) a fixed x-range rounded outwards to powers of ten.
private static void ConfigureLambdaAxis(IndexedDataTable<double> table, double[] lambda) {
  table.VisualProperties.YAxisMaximumAuto = false;
  table.VisualProperties.YAxisMinimumAuto = false;
  table.VisualProperties.XAxisMaximumAuto = false;
  table.VisualProperties.XAxisMinimumAuto = false;

  table.VisualProperties.XAxisLogScale = true;
  table.VisualProperties.XAxisTitle = "Lambda";

  if (lambda.Length > 2) {
    // the lower limit comes from the last lambda, the upper limit from the second one (the first is skipped)
    // NOTE(review): presumably the first lambda can be an extreme value that would distort the axis - confirm
    table.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda[lambda.Length - 1])));
    table.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda[1])));
  }
}

// Creates a data row of (lambda, value) pairs that is drawn as points.
private static IndexedDataRow<double> CreatePointsRow(string name, string description, double[] lambda, IEnumerable<double> values) {
  var row = new IndexedDataRow<double>(name, description, lambda.Zip(values, (l, v) => Tuple.Create(l, v)));
  row.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  return row;
}

// Creates one data row per coefficient column (factor values first, then double variables) holding the coefficient
// multiplied by the standard deviation of the corresponding input, and returns the rows ordered by descending sum of
// absolute values (larger area under the curve => more important).
private static IndexedDataRow<double>[] CreateCoefficientPathRows(IRegressionProblemData problemData, double[] lambda, double[,] coeff) {
  var ds = problemData.Dataset;
  var allowedVars = problemData.AllowedInputVariables.ToArray();
  var doubleVariables = allowedVars.Where(ds.VariableHasType<double>);
  var factorVariableNames = allowedVars.Where(ds.VariableHasType<string>);
  var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)

  var nLambdas = lambda.Length;
  var dataRows = new IndexedDataRow<double>[coeff.GetLength(1)];
  int i = 0;
  foreach (var factorVariableAndValues in factorVariablesAndValues) {
    foreach (var factorValue in factorVariableAndValues.Value) {
      double sigma = ds.GetStringValues(factorVariableAndValues.Key)
        .Select(s => s == factorValue ? 1.0 : 0.0)
        .StandardDeviation(); // std dev of the binary indicator for this factor value
      int col = i; // fixed copy of the column index for the lambda expression below
      var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, col] * sigma)).ToArray();
      string rowName = factorVariableAndValues.Key + "=" + factorValue;
      dataRows[i] = new IndexedDataRow<double>(rowName, rowName, path);
      i++;
    }
  }

  foreach (var doubleVariable in doubleVariables) {
    double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
    int col = i; // fixed copy of the column index for the lambda expression below
    var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, col] * sigma)).ToArray();
    dataRows[i] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path);
    i++;
  }

  return dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x))).ToArray();
}
| 248 |
|
---|
/// <summary>
/// Calculates the model coefficients for exactly one lambda value by delegating to the overload
/// that accepts an array of lambdas.
/// </summary>
/// <param name="problemData">Problem data with the inputs and the target</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">The single lambda value to use</param>
/// <param name="trainNMSE">NMSE on the training set for the returned solution</param>
/// <param name="testNMSE">NMSE on the test set for the returned solution</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <returns>The coefficient vector of the first solution; its last element is the intercept</returns>
public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda,
  out double trainNMSE, out double testNMSE,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
  var singleLambda = new[] { lambda };
  double[] trainPath;
  double[] testPath;
  double[][] solutions = CalculateModelCoefficients(problemData, penalty, singleLambda, out trainPath, out testPath, coeffLowerBound, coeffUpperBound);

  // only one lambda was supplied, so the first entry of each output is the one of interest
  trainNMSE = trainPath[0];
  testNMSE = testPath[0];
  return solutions[0];
}
/// <summary>
/// Calculates the model coefficients for multiple user-supplied lambda values.
/// </summary>
/// <param name="problemData">Problem data with the inputs and the target</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">User-supplied lambda values</param>
/// <param name="trainNMSEs">NMSE on the training set for each solution</param>
/// <param name="testNMSEs">NMSE on the test set for each solution</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution (-1 => all variables are allowed)</param>
/// <returns>One array per solution containing the coefficients followed by the intercept as the last element</returns>
public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
  out double[] trainNMSEs, out double[] testNMSEs,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1) {
  double[,] coeffMatrix;
  double[] intercepts;
  // flmin = 1.0 together with the supplied array selects the user-defined lambdas
  RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeffMatrix, out intercepts, coeffLowerBound, coeffUpperBound, maxVars);

  int numVars = coeffMatrix.GetLength(1);
  var solutions = new double[intercepts.Length][];
  for (int row = 0; row < solutions.Length; row++) {
    // one extra slot at the end for the intercept
    var solution = new double[numVars + 1];
    for (int col = 0; col < numVars; col++) {
      solution[col] = coeffMatrix[row, col];
    }
    solution[numVars] = intercepts[row];
    solutions[row] = solution;
  }
  return solutions;
}
| 281 |
|
---|
/// <summary>
/// Runs elastic-net regression with automatically determined lambda values
/// (at most 100 different lambda values).
/// </summary>
/// <param name="problemData">Problem data with the inputs and the target</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="lambda">Output lambda values</param>
/// <param name="trainNMSE">NMSE on the training set for each solution along the path</param>
/// <param name="testNMSE">NMSE on the test set for each solution along the path</param>
/// <param name="coeff">Coefficient matrix with one row per solution along the path</param>
/// <param name="intercept">Intercept for each solution along the path</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution (-1 => all variables are allowed)</param>
public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
  out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1
  ) {
  const int maxLambdaCount = 100;
  const double lambdaMinRatio = 0.0;
  // no user-supplied lambda values
  var noUserLambdas = new double[0];
  RunElasticNetLinearRegression(problemData, penalty, maxLambdaCount, lambdaMinRatio, noUserLambdas, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
}
| 291 |
|
---|
/// <summary>
/// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
/// </summary>
/// <param name="problemData">Predictor target matrix x and target vector y</param>
/// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
/// <param name="nlam">Maximum number of lambda values (default 100)</param>
/// <param name="flmin">User control of lambda values (&lt; 1.0 => minimum lambda = flmin * (largest lambda value), &gt;= 1.0 => use supplied lambda values)</param>
/// <param name="ulam">User supplied lambda values</param>
/// <param name="lambda">Output lambda values</param>
/// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
/// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
/// <param name="coeff">Matrix of coefficients: one row per solution along the path, one column per predictor variable</param>
/// <param name="intercept">Vector of intercepts for each solution along the path</param>
/// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
/// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
/// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
/// <exception cref="ArgumentException">Thrown when penalty is not within [0, 1] (this includes NaN)</exception>
private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
  int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
  double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
  int maxVars = -1
  ) {
  // negated range test: every comparison with NaN is false, so NaN is rejected here as well
  if (!(penalty >= 0.0 && penalty <= 1.0)) throw new ArgumentException("0 <= penalty <= 1", "penalty");

  double[,] trainX;
  double[,] testX;
  double[] trainY;
  double[] testY;

  PrepareData(problemData, out trainX, out trainY, out testX, out testY);
  // the input matrices are transposed: dimension 0 = variables, dimension 1 = observations
  var numTrainObs = trainX.GetLength(1);
  var numTestObs = testX.GetLength(1);
  var numVars = trainX.GetLength(0);

  int ka = 1; // => covariance updating algorithm
  double parm = penalty;
  double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
  int[] jd = new int[1]; // jd[0] == 0 => no predictor variable is excluded
  double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // same relative penalty (1.0) for all predictor variables (0.0 would mean unpenalized)
  double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
  for (int i = 0; i < numVars; i++) {
    cl[i, 0] = coeffLowerBound;
    cl[i, 1] = coeffUpperBound;
  }

  int ne = maxVars > 0 ? maxVars : numVars; // non-positive maxVars => all variables are allowed
  int nx = numVars;
  double thr = 1.0e-5; // default value as recommended in glmnet
  int isd = 1; // => regression on standardized predictor variables
  int intr = 1; // => do include intercept in model
  int maxit = 100000; // default value as recommended in glmnet
  // outputs
  int lmu = -1;
  double[,] ca;
  int[] ia;
  int[] nin;
  int nlp = -99;
  int jerr = -99;
  double[] trainR2;
  // NOTE(review): nlp and jerr are not inspected after the call; presumably Glmnet.elnet reports fatal errors itself - confirm
  Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);

  trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
  testNMSE = new double[lmu];
  coeff = new double[lmu, numVars];
  for (int solIdx = 0; solIdx < lmu; solIdx++) {
    trainNMSE[solIdx] = 1.0 - trainR2[solIdx];

    // extract the compressed coefficients of this solution
    int selectedNin = nin[solIdx];
    double[] coefficients;
    double[] selectedCa = new double[nx];
    for (int i = 0; i < nx; i++) {
      selectedCa[i] = ca[solIdx, i];
    }

    // apply to test set to calculate test NMSE values for each lambda step
    double[] fn;
    Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
    OnlineCalculatorError error;
    var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
    // the test error is reported as NaN when it cannot be calculated
    if (error != OnlineCalculatorError.None) nmse = double.NaN;
    testNMSE[solIdx] = nmse;

    // uncompress coefficients
    Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
    for (int i = 0; i < coefficients.Length; i++) {
      coeff[solIdx, i] = coefficients[i];
    }
  }
}
| 381 |
|
---|
/// <summary>
/// Extracts the input matrices and target vectors for the training and the test partition.
/// </summary>
/// <param name="problemData">Problem data providing dataset, target variable, allowed inputs and partitions</param>
/// <param name="trainX">Input matrix for the training rows (as returned by PrepareInputData)</param>
/// <param name="trainY">Target values for the training rows</param>
/// <param name="testX">Input matrix for the test rows (as returned by PrepareInputData)</param>
/// <param name="testY">Target values for the test rows</param>
private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
  out double[,] testX, out double[] testY) {
  var dataset = problemData.Dataset;
  var target = problemData.TargetVariable;
  var inputs = problemData.AllowedInputVariables;

  // training partition
  trainX = PrepareInputData(dataset, inputs, problemData.TrainingIndices);
  trainY = dataset.GetDoubleValues(target, problemData.TrainingIndices).ToArray();

  // test partition
  testX = PrepareInputData(dataset, inputs, problemData.TestIndices);
  testY = dataset.GetDoubleValues(target, problemData.TestIndices).ToArray();
}
[15023] | 393 |
|
---|
/// <summary>
/// Builds the input matrix for the given rows: the columns created from the factor variables
/// come first, followed by the columns of the double variables; the combined matrix is
/// returned transposed.
/// </summary>
/// <param name="ds">Dataset to read from</param>
/// <param name="allowedInputs">Names of the input variables to use</param>
/// <param name="rows">Row indices to include</param>
/// <returns>The transposed input matrix</returns>
private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) {
  var numericNames = allowedInputs.Where(ds.VariableHasType<double>);
  var factorNames = allowedInputs.Where(ds.VariableHasType<string>);

  // factor values are collected over all rows of the dataset (train and test set), not only the requested ones
  var allRows = Enumerable.Range(0, ds.Rows);
  var factorLevels = ds.GetFactorVariableValues(factorNames, allRows);

  double[,] factorColumns = ds.ToArray(factorLevels, rows);
  double[,] numericColumns = ds.ToArray(numericNames, rows);
  return factorColumns.HorzCat(numericColumns).Transpose();
}
[13927] | 403 | }
|
---|
| 404 | }
|
---|