Changeset 15147


Ignore:
Timestamp:
07/06/17 11:26:22 (3 months ago)
Author:
gkronber
Message:

#745: merged r15023, r15046 from trunk to stable

File:
1 edited

Legend:

Unmodified
Added
Removed
  • stable/HeuristicLab.Algorithms.DataAnalysis.Glmnet/3.4/ElasticNetLinearRegression.cs

    r14846 r15147  
    2121
    2222using System;
     23using System.Collections.Generic;
    2324using System.Linq;
    2425using System.Threading;
     
    8788    }
    8889
    89     private void CreateSolution(double lambda) {
     90  private void CreateSolution(double lambda) {
    9091      double trainNMSE;
    9192      double testNMSE;
    92       var coeff = CreateElasticNetLinearRegressionSolution(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);
     93      var coeff = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE);
    9394      Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE)));
    9495      Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE)));
    9596
    96       var allVariables = Problem.ProblemData.AllowedInputVariables.ToArray();
    97 
    98       var remainingVars = Enumerable.Range(0, allVariables.Length)
    99         .Where(idx => !coeff[idx].IsAlmost(0.0)).Select(idx => allVariables[idx])
    100         .ToArray();
    101       var remainingCoeff = Enumerable.Range(0, allVariables.Length)
    102         .Select(idx => coeff[idx])
    103         .Where(c => !c.IsAlmost(0.0))
    104         .ToArray();
    105 
    106       var tree = LinearModelToTreeConverter.CreateTree(remainingVars, remainingCoeff, coeff.Last());
     97      var solution = CreateSymbolicSolution(coeff, Problem.ProblemData);
     98      Results.Add(new Result(solution.Name, solution.Description, solution));
     99    }
     100
     101    public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
     102      var ds = problemData.Dataset;
     103      var allVariables = problemData.AllowedInputVariables.ToArray();
     104      var doubleVariables = allVariables.Where(ds.VariableHasType<double>);
     105      var factorVariableNames = allVariables.Where(ds.VariableHasType<string>);
     106      var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
     107
     108      List<KeyValuePair<string, IEnumerable<string>>> remainingFactorVariablesAndValues = new List<KeyValuePair<string, IEnumerable<string>>>();
     109      List<double> factorCoeff = new List<double>();
     110      List<string> remainingDoubleVariables = new List<string>();
     111      List<double> doubleVarCoeff = new List<double>();
     112
     113      {
     114        int i = 0;
     115        // find factor variables & value combinations with non-zero coeff
     116        foreach (var factorVarAndValues in factorVariablesAndValues) {
     117          var l = new List<string>();
     118          foreach (var factorValue in factorVarAndValues.Value) {
     119            if (!coeff[i].IsAlmost(0.0)) {
     120              l.Add(factorValue);
     121              factorCoeff.Add(coeff[i]);
     122            }
     123            i++;
     124          }
     125          if (l.Any()) remainingFactorVariablesAndValues.Add(new KeyValuePair<string, IEnumerable<string>>(factorVarAndValues.Key, l));
     126        }
     127        // find double variables with non-zero coeff
     128        foreach (var doubleVar in doubleVariables) {
     129          if (!coeff[i].IsAlmost(0.0)) {
     130            remainingDoubleVariables.Add(doubleVar);
     131            doubleVarCoeff.Add(coeff[i]);
     132          }
     133          i++;
     134        }
     135      }
     136      var tree = LinearModelToTreeConverter.CreateTree(
     137        remainingFactorVariablesAndValues, factorCoeff.ToArray(),
     138        remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(),
     139        coeff.Last());
    107140
    108141
    109142      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
    110         new SymbolicRegressionModel(Problem.ProblemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
    111         (IRegressionProblemData)Problem.ProblemData.Clone());
     143        new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
     144        (IRegressionProblemData)problemData.Clone());
    112145      solution.Model.Name = "Elastic-net Linear Regression Model";
    113146      solution.Name = "Elastic-net Linear Regression Solution";
    114147
    115       Results.Add(new Result(solution.Name, solution.Description, solution));
     148      return solution;
    116149    }
    117150
     
    140173      var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
    141174      var numNonZeroCoeffs = new int[nLambdas];
    142       for (int i = 0; i < nCoeff; i++) {
    143         var coeffId = allowedVars[i];
    144         double sigma = Problem.ProblemData.Dataset.GetDoubleValues(coeffId).StandardDeviation();
    145         var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
    146         dataRows[i] = new IndexedDataRow<double>(coeffId, coeffId, path);
    147       }
    148       // add to coeffTable by total weight (larger area under the curve => more important);
    149       foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) {
    150         coeffTable.Rows.Add(r);
     175
     176      var ds = Problem.ProblemData.Dataset;
     177      var doubleVariables = allowedVars.Where(ds.VariableHasType<double>);
     178      var factorVariableNames = allowedVars.Where(ds.VariableHasType<string>);
     179      var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
     180      {
     181        int i = 0;
     182        foreach (var factorVariableAndValues in factorVariablesAndValues) {
     183          foreach (var factorValue in factorVariableAndValues.Value) {
     184            double sigma = ds.GetStringValues(factorVariableAndValues.Key)
     185              .Select(s => s == factorValue ? 1.0 : 0.0)
     186              .StandardDeviation(); // calc std dev of binary indicator
     187            var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
     188            dataRows[i] = new IndexedDataRow<double>(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path);
     189            i++;
     190          }
     191        }
     192
     193        foreach (var doubleVariable in doubleVariables) {
     194          double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
     195          var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
     196          dataRows[i] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path);
     197          i++;
     198        }
     199        // add to coeffTable by total weight (larger area under the curve => more important);
     200        foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) {
     201          coeffTable.Rows.Add(r);
     202        }
    151203      }
    152204
     
    195247    }
    196248
    197     public static double[] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double lambda,
     249    public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda,
    198250            out double trainNMSE, out double testNMSE,
    199251            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
     
    201253      double[] testNMSEs;
    202254      // run for exactly one lambda
    203       var coeffs = CreateElasticNetLinearRegressionSolution(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
     255      var coeffs = CalculateModelCoefficients(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
    204256      trainNMSE = trainNMSEs[0];
    205257      testNMSE = testNMSEs[0];
    206258      return coeffs[0];
    207259    }
    208     public static double[][] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double[] lambda,
     260    public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
    209261            out double[] trainNMSEs, out double[] testNMSEs,
    210262            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
     
    330382    private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY,
    331383      out double[,] testX, out double[] testY) {
    332 
    333384      var ds = problemData.Dataset;
    334       trainX = ds.ToArray(problemData.AllowedInputVariables, problemData.TrainingIndices);
    335       trainX = trainX.Transpose();
    336       trainY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable,
    337         problemData.TrainingIndices)
    338         .ToArray();
    339       testX = ds.ToArray(problemData.AllowedInputVariables, problemData.TestIndices);
    340       testX = testX.Transpose();
    341       testY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable,
    342         problemData.TestIndices)
    343         .ToArray();
     385      var targetVariable = problemData.TargetVariable;
     386      var allowedInputs = problemData.AllowedInputVariables;
     387      trainX = PrepareInputData(ds, allowedInputs, problemData.TrainingIndices);
     388      trainY = ds.GetDoubleValues(targetVariable, problemData.TrainingIndices).ToArray();
     389
     390      testX = PrepareInputData(ds, allowedInputs, problemData.TestIndices);
     391      testY = ds.GetDoubleValues(targetVariable, problemData.TestIndices).ToArray();
     392    }
     393
     394    private static double[,] PrepareInputData(IDataset ds, IEnumerable<string> allowedInputs, IEnumerable<int> rows) {
     395      var doubleVariables = allowedInputs.Where(ds.VariableHasType<double>);
     396      var factorVariableNames = allowedInputs.Where(ds.VariableHasType<string>);
     397      var factorVariables = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)
     398      double[,] binaryMatrix = ds.ToArray(factorVariables, rows);
     399      double[,] doubleVarMatrix = ds.ToArray(doubleVariables, rows);
     400      var x = binaryMatrix.HorzCat(doubleVarMatrix);
     401      return x.Transpose();
    344402    }
    345403  }
Note: See TracChangeset for help on using the changeset viewer.