source: branches/2847_M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/M5Leaf.cs @ 16847

Last change on this file since 16847 was 16847, checked in by gkronber, 2 months ago

#2847: made some minor changes while reviewing

File size: 8.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
29using HEAL.Attic;
30
31namespace HeuristicLab.Algorithms.DataAnalysis {
32  [StorableType("58517042-5318-4087-B098-AC75F0208BA0")]
33  [Item("M5Leaf", "A leaf type that uses regularized linear models with feature selection as leaf models.")]
34  public class M5Leaf : LeafBase {
35    #region Constructors & Cloning
36    [StorableConstructor]
37    private M5Leaf(StorableConstructorFlag _) : base(_) { }
38    private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { }
39    public M5Leaf() { }
40    public override IDeepCloneable Clone(Cloner cloner) {
41      return new M5Leaf(this, cloner);
42    }
43    #endregion
44
45    #region IModelType
46    public override bool ProvidesConfidence {
47      get { return false; }
48    }
49    public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
50      if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model");
51
52      if (pd.Dataset.Rows == 1)
53        return new ConstantLeaf().Build(pd, random, cancellationToken, out numberOfParameters);
54
55      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average());
56      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Variance());
57      var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList();
58
59      var targetMean = pd.TargetVariableTrainingValues.Average();
60      var targetVariance = pd.TargetVariableTrainingValues.Variance();
61
62      var model = FindBestModel(variances, means, targetMean, targetVariance, pd, used);
63      numberOfParameters = 1 + model.Coefficients.Count;
64      return model;
65    }
66    public override int MinLeafSize(IRegressionProblemData pd) {
67      return 1;
68    }
69    #endregion
70
71    private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double yVariance, IRegressionProblemData pd, IList<string> variables) {
72      Dictionary<string, double> coeffs;
73      double intercept;
74      do {
75        coeffs = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept);
76        variables = DeselectColinear(variances, coeffs, yVariance, pd, variables);
77      }
78      while (coeffs.Count != variables.Count);
79      var numAtts = variables.Count;
80      var numInst = pd.TrainingIndices.Count();
81      var fullMse = CalculateSE(coeffs, intercept, pd, variables);
82      var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts;
83
84      var improved = true;
85      var currentNumAttributes = numAtts;
86
87      while (improved && currentNumAttributes > 1) {
88        improved = false;
89        currentNumAttributes--;
90        // Find attribute with smallest SC (variance-scaled coefficient)
91        var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance)))
92          .OrderBy(x => x.Value).Select(x => x.Key).First();
93
94        var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
95        var currentIntercept = 0.0;
96        var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept);
97        var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
98        var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;
99
100        if (!(currentAkaike < akaike)) continue;
101        improved = true;
102        akaike = currentAkaike;
103        coeffs = currentCoeffs;
104        intercept = currentIntercept;
105        variables = currVariables;
106      }
107
108      var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable);
109      pd2.TestPartition.End = pd.TestPartition.End;
110      pd2.TestPartition.Start = pd.TestPartition.Start;
111      pd2.TrainingPartition.End = pd.TrainingPartition.End;
112      pd2.TrainingPartition.Start = pd.TrainingPartition.Start;
113
114      return new PreconstructedLinearModel(coeffs, intercept, pd.TargetVariable);
115    }
116
117    private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double ridge, out double intercept) {
118
119      var n = variables.Count;
120      var m = pd.TrainingIndices.Count();
121
122      var inTr = new double[n, m];
123      for (var i = 0; i < n; i++) {
124        var v = variables[i];
125        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
126        var sd = Math.Sqrt(variances[v]);
127        var mean = means[v];
128        for (var j = 0; j < m; j++) {
129          inTr[i, j] = (vdata[j] - mean) / sd;
130        }
131      }
132
133      var y = new double[m, 1];
134      var ydata = pd.TargetVariableTrainingValues.ToArray();
135      for (var i = 0; i < m; i++)
136        y[i, 0] = ydata[i]; //no scaling for targets;
137
138
139      var aTy = new double[n, 1];
140      alglib.rmatrixgemm(n, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y;
141      var aTa = new double[n, n];
142      alglib.rmatrixgemm(n, n, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa //
143
144      var aTaDecomp = new double[n, n];
145      bool success;
146      var tries = 0;
147      double[] coefficients = null;
148      do {
149        for (var i = 0; i < n; i++) aTa[i, i] += ridge; // add ridge to diagonal to enforce singularity
150        try {
151          //solve "aTa * coefficients = aTy" for coefficients;
152          Array.Copy(aTa, 0, aTaDecomp, 0, aTa.Length);
153          alglib.spdmatrixcholesky(ref aTaDecomp, n, true);
154          int info;
155          alglib.densesolverreport report;
156          alglib.spdmatrixcholeskysolve(aTaDecomp, n, true, ydata, out info, out report, out coefficients);
157
158          if (info != 1) throw new Exception();
159          success = true;
160        }
161        catch (Exception) {
162          for (var i = 0; i < n; i++) aTa[i, i] -= ridge;
163          ridge *= 10; // increase ridge;
164          success = false;
165        }
166        finally {
167          tries++;
168        }
169      }
170      while (!success && tries < 100);
171      if (coefficients == null) throw new ArgumentException("No linear model could be built");
172
173      intercept = yMean;
174      var res = new Dictionary<string, double>();
175      for (var i = 0; i < n; i++) {
176        var v = variables[i];
177        res.Add(v, coefficients[i] /= Math.Sqrt(variances[v]));
178        intercept -= coefficients[i] * means[v];
179      }
180
181      return res;
182    }
183
184    private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double yVariance, IRegressionProblemData pd, IList<string> variables) {
185      var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance))).Where(x => x.Value > 1.5).OrderBy(x => -x.Value).ToList();
186      if (candidates.Count == 0) return variables;
187      var c = candidates.First().Key;
188      return variables.Where(v => !v.Equals(c)).ToList();
189    }
190
191    private static double CalculateSE(Dictionary<string, double> coefficients, double intercept, IRegressionProblemData pd, IList<string> variables) {
192      return pd.TrainingIndices.Select(i => RegressionPrediction(i, pd, variables, coefficients, intercept) - pd.Dataset.GetDoubleValue(pd.TargetVariable, i)).Select(error => error * error).Sum();
193    }
194    private static double RegressionPrediction(int i, IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> coefficients, double intercept) {
195      return intercept + variables.Sum(v => pd.Dataset.GetDoubleValue(v, i) * coefficients[v]);
196    }
197  }
198}
Note: See TracBrowser for help on using the repository browser.