Free cookie consent management tool by TermsFeed Policy Generator

source: branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafModels/PreconstructedLinearModel.cs @ 15833

Last change on this file since 15833 was 15833, checked in by bwerth, 6 years ago

#2847: added handling of empty and underdetermined data sets

File size: 8.4 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Diagnostics;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
28using HeuristicLab.Problems.DataAnalysis;
29
30namespace HeuristicLab.Algorithms.DataAnalysis {
31  //mulitdimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
32  [StorableClass]
33  internal sealed class PreconstructedLinearModel : RegressionModel, IConfidenceRegressionModel {
34    [Storable]
35    public Dictionary<string, double> Coefficients { get; private set; }
36    [Storable]
37    public double Intercept { get; private set; }
38    [Storable]
39    private Dictionary<string, double> Means { get; set; }
40    [Storable]
41    private Dictionary<string, double> Variances { get; set; }
42    [Storable]
43    private double ResidualVariance { get; set; }
44    [Storable]
45    private int SampleSize { get; set; }
46
47    public override IEnumerable<string> VariablesUsedForPrediction {
48      get { return Coefficients.Keys; }
49    }
50    #region HLConstructors
51    [StorableConstructor]
52    private PreconstructedLinearModel(bool deserializing) : base(deserializing) { }
53    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
54      if (original.Coefficients != null) Coefficients = original.Coefficients.ToDictionary(x => x.Key, x => x.Value);
55      Intercept = original.Intercept;
56      if (original.Means != null) Means = original.Means.ToDictionary(x => x.Key, x => x.Value);
57      if (original.Variances != null) Variances = original.Variances.ToDictionary(x => x.Key, x => x.Value);
58      ResidualVariance = original.ResidualVariance;
59      SampleSize = original.SampleSize;
60    }
61    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable, double residualVariance = 0, double sampleSize = 0) : base(targetvariable) {
62      Coefficients = coefficients;
63      Intercept = intercept;
64      Variances = variances;
65      Means = means;
66      ResidualVariance = 0;
67      SampleSize = 0;
68    }
69    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
70      Coefficients = new Dictionary<string, double>();
71      Intercept = intercept;
72      Variances = new Dictionary<string, double>();
73      ResidualVariance = 0;
74      SampleSize = 0;
75    }
76    public override IDeepCloneable Clone(Cloner cloner) {
77      return new PreconstructedLinearModel(this, cloner);
78    }
79    #endregion
80
81    public static PreconstructedLinearModel CreateConfidenceLinearModel(IRegressionProblemData pd, out double rmse, out double cvRmse) {
82      rmse = double.NaN;
83      cvRmse = double.NaN;
84      return AlternativeCalculation(pd);
85    }
86
87    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd, out double rmse, out double cvRmse) {
88      var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
89        pd.TargetVariable
90      }), pd.AllIndices);
91
92      var nFeatures = inputMatrix.GetLength(1) - 1;
93      double[] coefficients;
94
95      alglib.linearmodel lm;
96      alglib.lrreport ar;
97      int retVal;
98      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
99      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
100      rmse = ar.rmserror;
101      cvRmse = ar.cvrmserror;
102
103      alglib.lrunpack(lm, out coefficients, out nFeatures);
104
105
106      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Average());
107      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Variance());
108      var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new {s, d}).ToDictionary(x => x.s, x => x.d);
109      var res = new PreconstructedLinearModel(means, variances, coeffs, coefficients[nFeatures], pd.TargetVariable);
110
111      res.ResidualVariance = pd.TargetVariableValues.Zip(res.GetEstimatedValues(pd.Dataset, pd.TrainingIndices), (x, y) => x - y).Variance();
112      res.SampleSize = pd.TrainingIndices.Count();
113      return res;
114    }
115
116    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd) {
117      var means = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Average());
118      var variances = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Variance());
119      var cmean = pd.TargetVariableTrainingValues.Average();
120      var variables = pd.AllowedInputVariables.ToList();
121      var n = variables.Count;
122      var m = pd.TrainingIndices.Count();
123
124      //Set up X^T and y
125      var inTr = new double[n + 1, m];
126      for (var i = 0; i < n; i++) {
127        var v = variables[i];
128        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
129        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
130      }
131
132      for (var i = 0; i < m; i++) inTr[n, i] = 1;
133
134      var y = new double[m, 1];
135      var ydata = pd.TargetVariableTrainingValues.ToArray();
136      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];
137
138      //Perform linear regression
139      var aTy = new double[n + 1, 1];
140      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y;
141      var aTa = new double[n + 1, n + 1];
142      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa //
143      alglib.spdmatrixcholesky(ref aTa, n + 1, true);
144      int info;
145      alglib.densesolverreport report;
146      double[] coefficients;
147      var aTyVector = new double[n + 1];
148      for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];
149      alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
150      double rmse, cvrmse;
151      if (info != 1) return ClassicCalculation(pd, out rmse, out cvrmse);
152
153      //extract coefficients
154      var intercept = coefficients[n];
155      var coeffs = new Dictionary<string, double>();
156      for (var i = 0; i < n; i++) coeffs.Add(variables[i], coefficients[i]);
157
158      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
159    }
160
161    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
162      return rows.Select(row => GetEstimatedValue(dataset, row));
163    }
164
165    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
166      return new RegressionSolution(this, problemData);
167    }
168
169    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
170      return rows.Select(i => GetEstimatedVariance(dataset, i));
171    }
172
173    #region helpers
174    private double GetEstimatedValue(IDataset dataset, int row) {
175      return Intercept + (Coefficients.Count == 0 ? 0 : Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row)));
176    }
177    private double GetEstimatedVariance(IDataset dataset, int row) {
178      if (SampleSize == 0) return 0.0;
179      var sum = (from var in Variances let d = dataset.GetDoubleValue(var.Key, row) - Means[var.Key] select d * d / var.Value).Sum();
180      var res = ResidualVariance * (SampleSize - 1) / (SampleSize - 2) * (1.0 / SampleSize + sum / (SampleSize - 1));
181      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
182      return Math.Sqrt(res);
183    }
184    #endregion
185  }
186}
Note: See TracBrowser for help on using the repository browser.