source: branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs @ 15365

Last change on this file since 15365 was 15365, checked in by gkronber, 5 years ago

#2789 experiments with spline smoothing and additive models

File size: 10.1 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Concurrent;
24using System.Collections.Generic;
25using System.Linq;
26using System.Threading;
27using System.Threading.Tasks;
28using HeuristicLab.Analysis;
29using HeuristicLab.Common;
30using HeuristicLab.Core;
31using HeuristicLab.Data;
32using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
33using HeuristicLab.Optimization;
34using HeuristicLab.Parameters;
35using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
36using HeuristicLab.Problems.DataAnalysis;
37using HeuristicLab.Problems.DataAnalysis.Symbolic;
38using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
39
40namespace HeuristicLab.Algorithms.DataAnalysis.Experimental {
41  // UNFINISHED
42  [Item("Generalized Additive Modelling", "GAM")]
43  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 102)]
44  [StorableClass]
45  public sealed class GAM : FixedDataAnalysisAlgorithm<IRegressionProblem> {
46    [StorableConstructor]
47    private GAM(bool deserializing) : base(deserializing) { }
48    [StorableHook(HookType.AfterDeserialization)]
49    private void AfterDeserialization() {
50    }
51
52    private GAM(GAM original, Cloner cloner)
53      : base(original, cloner) {
54    }
55    public override IDeepCloneable Clone(Cloner cloner) {
56      return new GAM(this, cloner);
57    }
58
59    public GAM()
60      : base() {
61      Problem = new RegressionProblem();
62      Parameters.Add(new ValueParameter<DoubleValue>("Lambda", "Regularization for smoothing splines", new DoubleValue(1.0)));
63      Parameters.Add(new ValueParameter<IntValue>("Max iterations", "", new IntValue(100)));
64    }
65
66
67    protected override void Run(CancellationToken cancellationToken) {
68
69      double lambda = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value;
70      int maxIters = ((IValueParameter<IntValue>)Parameters["Max iterations"]).Value.Value;
71
72      // calculates a GAM model using a linear representation + independent non-linear functions of each variable
73      // using backfitting algorithm (see The Elements of Statistical Learning page 298)
74
75      var problemData = Problem.ProblemData;
76      var y = problemData.TargetVariableTrainingValues.ToArray();
77      var avgY = y.Average();
78      var inputVars = Problem.ProblemData.AllowedInputVariables.ToArray();
79      IRegressionModel[] f = new IRegressionModel[inputVars.Length * 2]; // linear(x) + nonlinear(x)
80      for(int i=0;i<f.Length;i++) {
81        f[i] = new ConstantModel(0.0, problemData.TargetVariable);
82      }
83
84      var rmseTable = new DataTable("RMSE");
85      var rmseRow = new DataRow("RMSE (train)");
86      var rmseRowTest = new DataRow("RMSE (test)");
87      rmseTable.Rows.Add(rmseRow);
88      rmseTable.Rows.Add(rmseRowTest);
89
90      Results.Add(new Result("RMSE", rmseTable));
91      rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
92      rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
93
94      // until convergence
95      int iters = 0;
96      var t = new double[y.Length];
97      while (iters++ < maxIters) {
98        int j = 0;
99        foreach (var inputVar in inputVars) {
100          var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
101          f[j] = RegressLR(problemData, inputVar, res);
102          j++;
103        }
104        foreach (var inputVar in inputVars) {
105          var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
106          f[j] = RegressSpline(problemData, inputVar, res, lambda);
107          j++;
108        }
109
110        rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
111        rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
112
113        if (cancellationToken.IsCancellationRequested) break;
114      }
115
116      var model = new RegressionEnsembleModel(f.Concat(new[] { new ConstantModel(avgY, problemData.TargetVariable) }));
117      model.AverageModelEstimates = false;
118      Results.Add(new Result("Ensemble solution", model.CreateRegressionSolution((IRegressionProblemData)problemData.Clone())));
119    }
120
121    private double[] CalculateResiduals(IRegressionProblemData problemData, IRegressionModel[] f, int j, double avgY, IEnumerable<int> rows) {
122      var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
123      double[] t = y.Select(yi => yi - avgY).ToArray();
124      // collect other predictions
125      for (int k = 0; k < f.Length; k++) {
126        if (k != j) {
127          var pred = f[k].GetEstimatedValues(problemData.Dataset, rows).ToArray();
128          // determine target for this smoother
129          for (int i = 0; i < t.Length; i++) {
130            t[i] -= pred[i];
131          }
132        }
133      }
134      return t;
135    }
136
137    private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) {
138      // Umständlich!
139      var ds = ((Dataset)problemData.Dataset).ToModifiable();
140      ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>());
141      var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
142      pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
143      pd.TrainingPartition.End = problemData.TrainingPartition.End;
144      pd.TestPartition.Start = problemData.TestPartition.Start;
145      pd.TestPartition.End = problemData.TestPartition.End;
146      double rmsError, cvRmsError;
147      return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model;
148    }
149
150    private IRegressionModel RegressSpline(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
151      if (problemData.Dataset.VariableHasType<double>(inputVar)) {
152        // Umständlich!
153        return Splines.CalculateSmoothingSplineReinschWithAutomaticTolerance(
154          problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(),
155          (double[])target.Clone(),
156          new string[] { inputVar },
157          targetVar: problemData.TargetVariable);
158      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
159    }
160    private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
161      if (problemData.Dataset.VariableHasType<double>(inputVar)) {
162        // Umständlich!
163        var ds = ((Dataset)problemData.Dataset).ToModifiable();
164        ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>());
165        var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
166        pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
167        pd.TrainingPartition.End = problemData.TrainingPartition.End;
168        pd.TestPartition.Start = problemData.TestPartition.Start;
169        pd.TestPartition.End = problemData.TestPartition.End;
170        double rmsError, oobRmsError;
171        double avgRelError, oobAvgRelError;
172        return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError);
173      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
174    }
175  }
176
177
178  // UNFINISHED
179  public class RBFModel : NamedItem, IRegressionModel {
180    private alglib.rbfmodel model;
181
182    public string TargetVariable { get; set; }
183
184    public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
185    private ITransformation<double>[] scaling;
186
187    public event EventHandler TargetVariableChanged;
188
189    public RBFModel(RBFModel orig, Cloner cloner) : base(orig, cloner) {
190      this.TargetVariable = orig.TargetVariable;
191      this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
192      this.model = (alglib.rbfmodel)orig.model.make_copy();
193      this.scaling = orig.scaling.Select(s => cloner.Clone(s)).ToArray();
194    }
195    public RBFModel(alglib.rbfmodel model, string targetVar, string[] inputs, IEnumerable<ITransformation<double>> scaling) : base("RBFModel", "RBFModel") {
196      this.model = model;
197      this.TargetVariable = targetVar;
198      this.VariablesUsedForPrediction = inputs;
199      this.scaling = scaling.ToArray();
200    }
201
202    public override IDeepCloneable Clone(Cloner cloner) {
203      return new RBFModel(this, cloner);
204    }
205
206    public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
207      return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
208    }
209
210    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
211      double[] x = new double[VariablesUsedForPrediction.Count()];
212      double[] y;
213      foreach (var r in rows) {
214        int c = 0;
215        foreach (var v in VariablesUsedForPrediction) {
216          x[c] = scaling[c].Apply(dataset.GetDoubleValue(v, r).ToEnumerable()).First(); // OUCH!
217          c++;
218        }
219        alglib.rbfcalc(model, x, out y);
220        yield return y[0];
221      }
222    }
223  }
224}
Note: See TracBrowser for help on using the repository browser.