source: stable/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs @ 16435

Last change on this file since 16435 was 16435, checked in by mkommend, 8 months ago

#2871: Merged r15626, r15637, r15665, r15673, r15727, r15728, r15752, r15796, r15797, r15798, r15799, r15802, r15998, r15999, r16015, r16021, r16023 into stable.

File size: 14.1 KB
Line 
1#region License Information
2
3/* HeuristicLab
4 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
5 *
6 * This file is part of HeuristicLab.
7 *
8 * HeuristicLab is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * HeuristicLab is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
20 */
21
22#endregion
23
24using System;
25using System.Collections.Generic;
26using System.Linq;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Parameters;
31using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32using HeuristicLab.Random;
33
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Calculates the impact of the input variables of a regression solution.
  /// The impact of a variable is the difference between the R² of the unmodified solution
  /// and the R² obtained after the values of that variable have been replaced.
  /// </summary>
  [StorableClass]
  [Item("RegressionSolution Impacts Calculator", "Calculation of the impacts of input variables for any regression solution")]
  public sealed class RegressionSolutionVariableImpactsCalculator : ParameterizedNamedItem {
    /// <summary>Replacement strategies for double variables.</summary>
    public enum ReplacementMethodEnum {
      // every row receives the median of the variable over the selected rows
      Median,
      // every row receives the average of the variable over the selected rows
      Average,
      // the values of the selected rows are permuted among the selected rows
      Shuffle,
      // the selected rows receive normally distributed values (average and standard deviation of the selected rows)
      Noise
    }

    /// <summary>Replacement strategies for string (factor) variables.</summary>
    public enum FactorReplacementMethodEnum {
      // every distinct value is tried as a constant replacement; the smallest resulting impact is reported
      Best,
      // every row receives the most frequent value of the selected rows
      Mode,
      // the values of the selected rows are permuted among the selected rows
      Shuffle
    }

    /// <summary>Rows of the problem data on which the impacts are calculated.</summary>
    public enum DataPartitionEnum {
      Training,
      Test,
      All
    }

    private const string ReplacementParameterName = "Replacement Method";
    private const string DataPartitionParameterName = "DataPartition";

    // fixed seed used for the Shuffle and Noise replacements, so repeated calculations use the same random sequence
    private const int RandomSeed = 31415;

    private const string OriginalR2ErrorMessage = "Error during R² calculation.";
    private const string ReplacedR2ErrorMessage = "Error during R² calculation with replaced inputs.";

    public IFixedValueParameter<EnumValue<ReplacementMethodEnum>> ReplacementParameter {
      get { return (IFixedValueParameter<EnumValue<ReplacementMethodEnum>>)Parameters[ReplacementParameterName]; }
    }
    public IFixedValueParameter<EnumValue<DataPartitionEnum>> DataPartitionParameter {
      get { return (IFixedValueParameter<EnumValue<DataPartitionEnum>>)Parameters[DataPartitionParameterName]; }
    }

    /// <summary>Replacement method for double variables, backed by <see cref="ReplacementParameter"/>.</summary>
    public ReplacementMethodEnum ReplacementMethod {
      get { return ReplacementParameter.Value.Value; }
      set { ReplacementParameter.Value.Value = value; }
    }
    /// <summary>Data partition used for the calculation, backed by <see cref="DataPartitionParameter"/>.</summary>
    public DataPartitionEnum DataPartition {
      get { return DataPartitionParameter.Value.Value; }
      set { DataPartitionParameter.Value.Value = value; }
    }


    [StorableConstructor]
    private RegressionSolutionVariableImpactsCalculator(bool deserializing) : base(deserializing) { }
    private RegressionSolutionVariableImpactsCalculator(RegressionSolutionVariableImpactsCalculator original, Cloner cloner)
      : base(original, cloner) { }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new RegressionSolutionVariableImpactsCalculator(this, cloner);
    }

    public RegressionSolutionVariableImpactsCalculator()
      : base() {
      Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Median)));
      Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training)));
    }

    /// <summary>
    /// Calculates the variable impacts of <paramref name="solution"/> with the configured
    /// <see cref="DataPartition"/> and <see cref="ReplacementMethod"/>. Factor variables use the
    /// default factor replacement method of <see cref="CalculateImpacts"/>.
    /// </summary>
    //mkommend: annoying name clash with static method, open to better naming suggestions
    public IEnumerable<Tuple<string, double>> Calculate(IRegressionSolution solution) {
      return CalculateImpacts(solution, DataPartition, ReplacementMethod);
    }

    /// <summary>
    /// Calculates the impact of every double and string input variable of <paramref name="solution"/>.
    /// </summary>
    /// <param name="solution">The regression solution to analyze.</param>
    /// <param name="data">The partition whose rows are used for the calculation.</param>
    /// <param name="replacementMethod">Replacement method for double variables.</param>
    /// <param name="factorReplacementMethod">Replacement method for string variables.</param>
    /// <param name="progressCallback">
    /// Optional callback that is invoked before each variable is processed with the fraction of
    /// processed variables and a status message. If it returns true the calculation is aborted.
    /// </param>
    /// <returns>
    /// Tuples of variable name and impact, ordered by descending impact,
    /// or null if the calculation was aborted by <paramref name="progressCallback"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="solution"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown for an unknown partition or replacement method.</exception>
    /// <exception cref="InvalidOperationException">Thrown if an R² value cannot be calculated.</exception>
    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
      IRegressionSolution solution,
      DataPartitionEnum data = DataPartitionEnum.Training,
      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
      Func<double, string, bool> progressCallback = null) {
      if (solution == null) throw new ArgumentNullException("solution");

      var problemData = solution.ProblemData;
      var dataset = problemData.Dataset;

      IEnumerable<int> rows;
      IEnumerable<double> targetValues;
      double originalR2;

      switch (data) {
        case DataPartitionEnum.All:
          rows = problemData.AllIndices;
          targetValues = problemData.TargetVariableValues.ToList();
          originalR2 = CalculateRSquared(targetValues, solution.EstimatedValues, OriginalR2ErrorMessage);
          break;
        case DataPartitionEnum.Training:
          rows = problemData.TrainingIndices;
          targetValues = problemData.TargetVariableTrainingValues.ToList();
          originalR2 = solution.TrainingRSquared;
          break;
        case DataPartitionEnum.Test:
          rows = problemData.TestIndices;
          targetValues = problemData.TargetVariableTestValues.ToList();
          originalR2 = solution.TestRSquared;
          break;
        default: throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
      }

      var impacts = new Dictionary<string, double>();
      // all replacements are performed on a modifiable copy
      var modifiableDataset = ((Dataset)dataset).ToModifiable();

      // variables are processed in the order in which they appear in the dataset
      var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction));
      var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList();
      var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>).ToList();
      var factorVariables = allowedInputVariables.Where(dataset.VariableHasType<string>).ToList();

      // progress is reported over double and factor variables so that the factor loop is also visible and cancellable
      int count = doubleVariables.Count + factorVariables.Count;
      int curIdx = 0;

      // calculate impacts for double variables
      foreach (var inputVariable in doubleVariables) {
        if (IsCancellationRequested(progressCallback, ++curIdx, count, inputVariable)) return null;

        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
        impacts[inputVariable] = originalR2 - CalculateRSquared(targetValues, newEstimates, ReplacedR2ErrorMessage);
      }

      // calculate impacts for string variables
      foreach (var inputVariable in factorVariables) {
        if (IsCancellationRequested(progressCallback, ++curIdx, count, inputVariable)) return null;

        if (factorReplacementMethod == FactorReplacementMethodEnum.Best) {
          // try replacing with all possible values and find the best replacement value
          var smallestImpact = double.PositiveInfinity;
          foreach (var repl in dataset.GetStringValues(inputVariable, rows).Distinct()) {
            var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
              Enumerable.Repeat(repl, dataset.Rows));
            var impact = originalR2 - CalculateRSquared(targetValues, newEstimates, ReplacedR2ErrorMessage);
            if (impact < smallestImpact) smallestImpact = impact;
          }
          impacts[inputVariable] = smallestImpact;
        } else {
          // for replacement methods shuffle and mode
          var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
            factorReplacementMethod);
          impacts[inputVariable] = originalR2 - CalculateRSquared(targetValues, newEstimates, ReplacedR2ErrorMessage);
        }
      }

      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    }

    // Reports the progress for the given variable; returns true if the callback requests that the calculation is stopped.
    private static bool IsCancellationRequested(Func<double, string, bool> progressCallback, int current, int total, string variable) {
      if (progressCallback == null) return false;
      return progressCallback((double)current / total,
        string.Format("Calculating impact for variable {0} ({1} of {2})", variable, current, total));
    }

    // Returns the squared Pearson's R of the two series; throws an InvalidOperationException with the given message on a calculation error.
    private static double CalculateRSquared(IEnumerable<double> targetValues, IEnumerable<double> estimatedValues, string errorMessage) {
      OnlineCalculatorError error;
      var r = OnlinePearsonsRCalculator.Calculate(targetValues, estimatedValues, out error);
      if (error != OnlineCalculatorError.None) throw new InvalidOperationException(errorMessage);
      return r * r;
    }

    // Builds a column with columnLength entries in which the values of the selected rows are permuted
    // among the selected rows; all other entries are set to filler.
    private static List<T> ShuffleWithinRows<T>(IList<T> originalValues, IEnumerable<int> rows, int columnLength, T filler) {
      // new var has same empirical distribution but the relation to y is broken
      IRandom rand = new FastRandom(RandomSeed);
      // prepare a complete column for the dataset
      var column = Enumerable.Repeat(filler, columnLength).ToList();
      // shuffle only the selected rows
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
      int i = 0;
      // update column values
      foreach (var r in rows) {
        column[r] = shuffledValues[i++];
      }
      return column;
    }

    // Evaluates the model on the given rows after the double variable has been replaced according to the replacement method.
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
      List<double> replacementValues;

      switch (replacement) {
        case ReplacementMethodEnum.Median:
          replacementValues = Enumerable.Repeat(rows.Select(r => originalValues[r]).Median(), dataset.Rows).ToList();
          break;
        case ReplacementMethodEnum.Average:
          replacementValues = Enumerable.Repeat(rows.Select(r => originalValues[r]).Average(), dataset.Rows).ToList();
          break;
        case ReplacementMethodEnum.Shuffle:
          // rows that are not selected are filled with NaN
          replacementValues = ShuffleWithinRows(originalValues, rows, dataset.Rows, double.NaN);
          break;
        case ReplacementMethodEnum.Noise:
          // materialized once because average and standard deviation both need the selected values
          IEnumerable<double> selectedValues = rows.Select(r => originalValues[r]).ToList();
          var avg = selectedValues.Average();
          var stdDev = selectedValues.StandardDeviation();
          IRandom rand = new FastRandom(RandomSeed);
          // prepare a complete column for the dataset
          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
          // update column values
          foreach (var r in rows) {
            replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
          }
          break;

        default:
          throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
      }

      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
    }

    // Evaluates the model on the given rows after the string variable has been replaced according to the replacement method.
    // FactorReplacementMethodEnum.Best is not handled here, the caller iterates over the candidate values itself.
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(
      IRegressionModel model, string variable, ModifiableDataset dataset,
      IEnumerable<int> rows,
      FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
      List<string> replacementValues;

      switch (replacement) {
        case FactorReplacementMethodEnum.Mode:
          var mostCommonValue = rows.Select(r => originalValues[r])
            .GroupBy(v => v)
            .OrderByDescending(g => g.Count())
            .First().Key;
          replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
          break;
        case FactorReplacementMethodEnum.Shuffle:
          // rows that are not selected are filled with the empty string
          replacementValues = ShuffleWithinRows(originalValues, rows, dataset.Rows, string.Empty);
          break;
        default:
          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
      }

      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
    }

    // Temporarily replaces the double variable with replacementValues, evaluates the model on the given rows
    // and restores the original values afterwards (also if the evaluation throws).
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
      dataset.ReplaceVariable(variable, replacementValues.ToList());
      try {
        //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
        return model.GetEstimatedValues(dataset, rows).ToList();
      } finally {
        dataset.ReplaceVariable(variable, originalValues);
      }
    }

    // Temporarily replaces the string variable with replacementValues, evaluates the model on the given rows
    // and restores the original values afterwards (also if the evaluation throws).
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
      dataset.ReplaceVariable(variable, replacementValues.ToList());
      try {
        //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
        return model.GetEstimatedValues(dataset, rows).ToList();
      } finally {
        dataset.ReplaceVariable(variable, originalValues);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.