Changeset 15147
 Timestamp: 07/06/17 11:26:22 (3 months ago)
 File: 1 edited
Legend: Unmodified, Added, Removed

stable/HeuristicLab.Algorithms.DataAnalysis.Glmnet/3.4/ElasticNetLinearRegression.cs
r14846 r15147 21 21 22 22 using System; 23 using System.Collections.Generic; 23 24 using System.Linq; 24 25 using System.Threading; … … 87 88 } 88 89 89 90 private void CreateSolution(double lambda) { 90 91 double trainNMSE; 91 92 double testNMSE; 92 var coeff = C reateElasticNetLinearRegressionSolution(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);93 var coeff = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE); 93 94 Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE))); 94 95 Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE))); 95 96 96 var allVariables = Problem.ProblemData.AllowedInputVariables.ToArray(); 97 98 var remainingVars = Enumerable.Range(0, allVariables.Length) 99 .Where(idx => !coeff[idx].IsAlmost(0.0)).Select(idx => allVariables[idx]) 100 .ToArray(); 101 var remainingCoeff = Enumerable.Range(0, allVariables.Length) 102 .Select(idx => coeff[idx]) 103 .Where(c => !c.IsAlmost(0.0)) 104 .ToArray(); 105 106 var tree = LinearModelToTreeConverter.CreateTree(remainingVars, remainingCoeff, coeff.Last()); 97 var solution = CreateSymbolicSolution(coeff, Problem.ProblemData); 98 Results.Add(new Result(solution.Name, solution.Description, solution)); 99 } 100 101 public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) { 102 var ds = problemData.Dataset; 103 var allVariables = problemData.AllowedInputVariables.ToArray(); 104 var doubleVariables = allVariables.Where(ds.VariableHasType<double>); 105 var factorVariableNames = allVariables.Where(ds.VariableHasType<string>); 106 var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) 107 108 List<KeyValuePair<string, IEnumerable<string>>> remainingFactorVariablesAndValues = new List<KeyValuePair<string, IEnumerable<string>>>(); 109 List<double> factorCoeff = 
new List<double>(); 110 List<string> remainingDoubleVariables = new List<string>(); 111 List<double> doubleVarCoeff = new List<double>(); 112 113 { 114 int i = 0; 115 // find factor varibles & value combinations with nonzero coeff 116 foreach (var factorVarAndValues in factorVariablesAndValues) { 117 var l = new List<string>(); 118 foreach (var factorValue in factorVarAndValues.Value) { 119 if (!coeff[i].IsAlmost(0.0)) { 120 l.Add(factorValue); 121 factorCoeff.Add(coeff[i]); 122 } 123 i++; 124 } 125 if (l.Any()) remainingFactorVariablesAndValues.Add(new KeyValuePair<string, IEnumerable<string>>(factorVarAndValues.Key, l)); 126 } 127 // find double variables with nonzero coeff 128 foreach (var doubleVar in doubleVariables) { 129 if (!coeff[i].IsAlmost(0.0)) { 130 remainingDoubleVariables.Add(doubleVar); 131 doubleVarCoeff.Add(coeff[i]); 132 } 133 i++; 134 } 135 } 136 var tree = LinearModelToTreeConverter.CreateTree( 137 remainingFactorVariablesAndValues, factorCoeff.ToArray(), 138 remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(), 139 coeff.Last()); 107 140 108 141 109 142 SymbolicRegressionSolution solution = new SymbolicRegressionSolution( 110 new SymbolicRegressionModel( Problem.ProblemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),111 (IRegressionProblemData) Problem.ProblemData.Clone());143 new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()), 144 (IRegressionProblemData)problemData.Clone()); 112 145 solution.Model.Name = "Elasticnet Linear Regression Model"; 113 146 solution.Name = "Elasticnet Linear Regression Solution"; 114 147 115 Results.Add(new Result(solution.Name, solution.Description, solution));148 return solution; 116 149 } 117 150 … … 140 173 var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray(); 141 174 var numNonZeroCoeffs = new int[nLambdas]; 142 for (int i = 0; i < nCoeff; i++) { 143 var coeffId = allowedVars[i]; 144 double 
sigma = Problem.ProblemData.Dataset.GetDoubleValues(coeffId).StandardDeviation(); 145 var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray(); 146 dataRows[i] = new IndexedDataRow<double>(coeffId, coeffId, path); 147 } 148 // add to coeffTable by total weight (larger area under the curve => more important); 149 foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) { 150 coeffTable.Rows.Add(r); 175 176 var ds = Problem.ProblemData.Dataset; 177 var doubleVariables = allowedVars.Where(ds.VariableHasType<double>); 178 var factorVariableNames = allowedVars.Where(ds.VariableHasType<string>); 179 var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) 180 { 181 int i = 0; 182 foreach (var factorVariableAndValues in factorVariablesAndValues) { 183 foreach (var factorValue in factorVariableAndValues.Value) { 184 double sigma = ds.GetStringValues(factorVariableAndValues.Key) 185 .Select(s => s == factorValue ? 
1.0 : 0.0) 186 .StandardDeviation(); // calc std dev of binary indicator 187 var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray(); 188 dataRows[i] = new IndexedDataRow<double>(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path); 189 i++; 190 } 191 } 192 193 foreach (var doubleVariable in doubleVariables) { 194 double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation(); 195 var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray(); 196 dataRows[i] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path); 197 i++; 198 } 199 // add to coeffTable by total weight (larger area under the curve => more important); 200 foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) { 201 coeffTable.Rows.Add(r); 202 } 151 203 } 152 204 … … 195 247 } 196 248 197 public static double[] C reateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double lambda,249 public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda, 198 250 out double trainNMSE, out double testNMSE, 199 251 double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) { … … 201 253 double[] testNMSEs; 202 254 // run for exactly one lambda 203 var coeffs = C reateElasticNetLinearRegressionSolution(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);255 var coeffs = CalculateModelCoefficients(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound); 204 256 trainNMSE = trainNMSEs[0]; 205 257 testNMSE = testNMSEs[0]; 206 258 return coeffs[0]; 207 259 } 208 public static double[][] C reateElasticNetLinearRegressionSolution(IRegressionProblemData 
problemData, double penalty, double[] lambda,260 public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda, 209 261 out double[] trainNMSEs, out double[] testNMSEs, 210 262 double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity, … … 330 382 private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY, 331 383 out double[,] testX, out double[] testY) { 332 333 384 var ds = problemData.Dataset; 334 trainX = ds.ToArray(problemData.AllowedInputVariables, problemData.TrainingIndices); 335 trainX = trainX.Transpose(); 336 trainY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, 337 problemData.TrainingIndices) 338 .ToArray(); 339 testX = ds.ToArray(problemData.AllowedInputVariables, problemData.TestIndices); 340 testX = testX.Transpose(); 341 testY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, 342 problemData.TestIndices) 343 .ToArray(); 385 var targetVariable = problemData.TargetVariable; 386 var allowedInputs = problemData.AllowedInputVariables; 387 trainX = PrepareInputData(ds, allowedInputs, problemData.TrainingIndices); 388 trainY = ds.GetDoubleValues(targetVariable, problemData.TrainingIndices).ToArray(); 389 390 testX = PrepareInputData(ds, allowedInputs, problemData.TestIndices); 391 testY = ds.GetDoubleValues(targetVariable, problemData.TestIndices).ToArray(); 392 } 393 394 private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) { 395 var doubleVariables = allowedInputs.Where(ds.VariableHasType<double>); 396 var factorVariableNames = allowedInputs.Where(ds.VariableHasType<string>); 397 var factorVariables = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) 398 double[,] binaryMatrix = ds.ToArray(factorVariables, rows); 399 double[,] 
doubleVarMatrix = ds.ToArray(doubleVariables, rows); 400 var x = binaryMatrix.HorzCat(doubleVarMatrix); 401 return x.Transpose(); 344 402 } 345 403 }
Note: See TracChangeset for help on using the changeset viewer.