source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs @ 13766

Last change on this file since 13766 was 13766, checked in by mkommend, 5 years ago

#2595: First version of impact calculation for regression solution.

File size: 7.1 KB
RevLine 
#region License Information

/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */

#endregion
23
24using System;
25using System.Collections.Generic;
26using System.Linq;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Parameters;
31using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Calculates the impact of each allowed input variable of a regression solution.
  /// The impact of a variable is the difference between the solution's original R² and the R²
  /// that is obtained when the variable is replaced by a constant value (median or average of
  /// the variable over the selected rows).
  /// </summary>
  [StorableClass]
  [Item("RegressionSolution Impacts Calculator", "Calculation of the impacts of input variables for a concrete regression solution.")]
  public sealed class RegressionSolutionVariableImpactsCalculator : ParameterizedNamedItem {
    /// <summary>Constant that is substituted for a variable while its impact is measured.</summary>
    public enum ReplacementMethodEnum {
      Median,
      Average
    }

    /// <summary>Rows of the problem data on which the impacts are calculated.</summary>
    public enum DataPartitionEnum {
      Training,
      Test,
      All
    }

    private const string ReplacementParameterName = "Replacement Method";
    private const string DataPartitionParameterName = "DataPartition";

    /// <summary>Parameter holding the configured <see cref="ReplacementMethodEnum"/>.</summary>
    public IFixedValueParameter<EnumValue<ReplacementMethodEnum>> ReplacementParameter {
      get { return (IFixedValueParameter<EnumValue<ReplacementMethodEnum>>)Parameters[ReplacementParameterName]; }
    }
    /// <summary>Parameter holding the configured <see cref="DataPartitionEnum"/>.</summary>
    public IFixedValueParameter<EnumValue<DataPartitionEnum>> DataPartitionParameter {
      get { return (IFixedValueParameter<EnumValue<DataPartitionEnum>>)Parameters[DataPartitionParameterName]; }
    }

    /// <summary>Gets or sets the replacement method stored in <see cref="ReplacementParameter"/>.</summary>
    public ReplacementMethodEnum ReplacementMethod {
      get { return ReplacementParameter.Value.Value; }
      set { ReplacementParameter.Value.Value = value; }
    }
    /// <summary>Gets or sets the data partition stored in <see cref="DataPartitionParameter"/>.</summary>
    public DataPartitionEnum DataPartition {
      get { return DataPartitionParameter.Value.Value; }
      set { DataPartitionParameter.Value.Value = value; }
    }


    [StorableConstructor]
    private RegressionSolutionVariableImpactsCalculator(bool deserializing) : base(deserializing) { }
    private RegressionSolutionVariableImpactsCalculator(RegressionSolutionVariableImpactsCalculator original, Cloner cloner)
      : base(original, cloner) { }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new RegressionSolutionVariableImpactsCalculator(this, cloner);
    }

    /// <summary>
    /// Creates a calculator that uses the median replacement and the training partition by default.
    /// </summary>
    public RegressionSolutionVariableImpactsCalculator()
      : base() {
      Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Median)));
      // The partition parameter must be registered under its own name; the DataPartitionParameter
      // getter looks it up via DataPartitionParameterName.
      Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training)));
    }

    //mkommend: annoying name clash with static method, open to better naming suggestions
    /// <summary>
    /// Calculates the variable impacts for <paramref name="solution"/> using the data partition
    /// and replacement method configured in this instance's parameters.
    /// </summary>
    /// <param name="solution">The regression solution to analyze.</param>
    /// <returns>Pairs of variable name and impact, ordered by descending impact.</returns>
    public IEnumerable<Tuple<string, double>> Calculate(IRegressionSolution solution) {
      return CalculateImpacts(solution, DataPartition, ReplacementMethod);
    }

    /// <summary>
    /// Calculates the impact of every allowed input variable of <paramref name="solution"/>.
    /// For each variable the model is re-evaluated with the variable replaced by a constant and
    /// the impact is reported as (original R²) - (R² with the replaced variable).
    /// </summary>
    /// <param name="solution">The regression solution to analyze.</param>
    /// <param name="data">The partition whose rows are used for the calculation.</param>
    /// <param name="replacement">How the constant replacement value is determined.</param>
    /// <returns>Pairs of variable name and impact, ordered by descending impact.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="solution"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="data"/> or <paramref name="replacement"/> is not a handled enum value.</exception>
    /// <exception cref="InvalidOperationException">The Pearson's R calculator reported an error.</exception>
    public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
      DataPartitionEnum data = DataPartitionEnum.Training,
      ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
      if (solution == null) throw new ArgumentNullException("solution");

      var problemData = solution.ProblemData;
      var dataset = problemData.Dataset;

      // Rows and target values are materialized once because they are enumerated again for
      // every input variable in the loop below.
      IEnumerable<int> rows;
      List<double> targetValues;
      double originalR2 = -1;

      OnlineCalculatorError error;

      switch (data) {
        case DataPartitionEnum.All:
          rows = problemData.AllIndices.ToList();
          targetValues = problemData.TargetVariableValues.ToList();
          // The solution exposes ready-made R² values only for training and test (used below),
          // so for all rows R² is computed here as the squared Pearson's R.
          originalR2 = OnlinePearsonsRCalculator.Calculate(targetValues, solution.EstimatedValues, out error);
          if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
          originalR2 = originalR2 * originalR2;
          break;
        case DataPartitionEnum.Training:
          rows = problemData.TrainingIndices.ToList();
          targetValues = problemData.TargetVariableTrainingValues.ToList();
          originalR2 = solution.TrainingRSquared;
          break;
        case DataPartitionEnum.Test:
          rows = problemData.TestIndices.ToList();
          targetValues = problemData.TargetVariableTestValues.ToList();
          originalR2 = solution.TestRSquared;
          break;
        default: throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
      }


      var impacts = new Dictionary<string, double>();
      // Variables are replaced on a modifiable dataset obtained via ToModifiable().
      var modifiableDataset = ((Dataset)dataset).ToModifiable();

      foreach (var inputVariable in problemData.AllowedInputVariables) {
        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacement);
        var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
        if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");

        newR2 = newR2 * newR2;
        impacts[inputVariable] = originalR2 - newR2;
      }
      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    }

    /// <summary>
    /// Evaluates <paramref name="model"/> on <paramref name="rows"/> while <paramref name="variable"/>
    /// is temporarily replaced in <paramref name="dataset"/> by a constant value. The constant is the
    /// median or average of the variable over <paramref name="rows"/>; the original values are
    /// written back before the method returns.
    /// </summary>
    /// <returns>The estimated values for <paramref name="rows"/> with the replaced variable.</returns>
    /// <exception cref="ArgumentException"><paramref name="replacement"/> is not a handled enum value.</exception>
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
      double replacementValue;

      switch (replacement) {
        case ReplacementMethodEnum.Median:
          replacementValue = rows.Select(r => originalValues[r]).Median();
          break;
        case ReplacementMethodEnum.Average:
          replacementValue = rows.Select(r => originalValues[r]).Average();
          break;
        default:
          throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
      }

      // The whole column is overwritten with the constant, not only the selected rows.
      dataset.ReplaceVariable(variable, Enumerable.Repeat(replacementValue, dataset.Rows).ToList());
      try {
        //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
        return model.GetEstimatedValues(dataset, rows).ToList();
      } finally {
        // Restore the original column even if the model evaluation throws, so that the
        // dataset is not left in a modified state for subsequent variables.
        dataset.ReplaceVariable(variable, originalValues);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.