Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
12/21/18 14:20:24 (6 years ago)
Author:
mkommend
Message:

#2904: Merged r16422 and r16423 into stable.

Location:
stable
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • stable

  • stable/HeuristicLab.Problems.DataAnalysis

  • stable/HeuristicLab.Problems.DataAnalysis/3.4

    • Property svn:mergeinfo set to (toggle deleted branches)
      /trunk/HeuristicLab.Problems.DataAnalysis/3.4mergedeligible
      /branches/2904_CalculateImpacts/3.415808-16421
      /branches/Async/HeuristicLab.Problems.DataAnalysis/3.413329-15286
      /branches/Classification-Extensions/HeuristicLab.Problems.DataAnalysis/3.411606-11761
      /branches/ClassificationModelComparison/HeuristicLab.Problems.DataAnalysis/3.49073-13099
      /branches/CloningRefactoring/HeuristicLab.Problems.DataAnalysis/3.44656-4721
      /branches/DataAnalysis Refactoring/HeuristicLab.Problems.DataAnalysis/3.45471-5808
      /branches/DataAnalysis SolutionEnsembles/HeuristicLab.Problems.DataAnalysis/3.45815-6180
      /branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.44220,​4226,​4236-4238,​4389,​4458-4459,​4462,​4464
      /branches/DataAnalysisCSVImport/HeuristicLab.Problems.DataAnalysis/3.48713-8875
      /branches/DataPreprocessing/HeuristicLab.Problems.DataAnalysis/3.410085-11101
      /branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.48035-8538
      /branches/GP.Grammar.Editor/HeuristicLab.Problems.DataAnalysis/3.46284-6795
      /branches/GP.Symbols (TimeLag, Diff, Integral)/HeuristicLab.Problems.DataAnalysis/3.45060
      /branches/HeuristicLab.DatasetRefactor/sources/HeuristicLab.Problems.DataAnalysis/3.411570-12508
      /branches/HeuristicLab.Problems.Orienteering/HeuristicLab.Problems.DataAnalysis/3.411130-12721
      /branches/HeuristicLab.RegressionSolutionGradientView/HeuristicLab.Problems.DataAnalysis/3.413819-14091
      /branches/HeuristicLab.TimeSeries/HeuristicLab.Problems.DataAnalysis/3.47098-8789
      /branches/LogResidualEvaluator/HeuristicLab.Problems.DataAnalysis/3.410202-10483
      /branches/NET40/sources/HeuristicLab.Problems.DataAnalysis/3.45138-5162
      /branches/ParallelEngine/HeuristicLab.Problems.DataAnalysis/3.45175-5192
      /branches/ProblemInstancesRegressionAndClassification/HeuristicLab.Problems.DataAnalysis/3.47570-7810
      /branches/QAPAlgorithms/HeuristicLab.Problems.DataAnalysis/3.46350-6627
      /branches/Restructure trunk solution/HeuristicLab.Problems.DataAnalysis/3.46828
      /branches/SimplifierViewsProgress/HeuristicLab.Problems.DataAnalysis/3.415318-15370
      /branches/SpectralKernelForGaussianProcesses/HeuristicLab.Problems.DataAnalysis/3.410204-10479
      /branches/Trunk/HeuristicLab.Problems.DataAnalysis/3.46829-6865
      /branches/histogram/HeuristicLab.Problems.DataAnalysis/3.45959-6341
      /branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis/3.414232-14825
      /trunk/sources/HeuristicLab.Problems.DataAnalysis/3.49498,​9552,​9762,​9973-9975,​9994,​10406,​10480,​10484,​10486,​10540-10541,​10543,​10545,​11031,​11114,​11116,​11156,​11330,​11332,​11417,​11422,​11623,​11631,​11634,​11762-11764,​11766,​12067,​12485,​12492,​12504,​12506,​12509-12512,​12524,​12578,​12581,​12612,​12622,​12641,​12649,​12664,​12722,​12770,​12772,​12790-12792,​12796,​12798,​12801,​12811-12812,​12816-12817,​12836-12837,​12851,​12907,​12971,​13001,​13027,​13038,​13040,​13100-13104,​13154,​13268,​13395,​13406,​13419,​13427-13430,​13434,​13440-13442,​13445-13447,​13450,​13474,​13501,​13503,​13511,​13513,​13525-13526,​13529,​13534-13535,​13539-13540,​13550,​13552,​13584-13585,​13593,​13645,​13648,​13650-13652,​13654,​13657-13659,​13661-13662,​13666,​13669,​13682-13684,​13690-13693,​13697-13698,​13700-13702,​13704-13705,​13708-13709,​13711,​13715,​13724,​13746,​13760-13761,​13766,​13785-13786,​13801,​13826,​13901,​13921-13922,​13925,​13938,​13941-13942,​13985-13987,​13992-13993,​14000-14001,​14015-14016,​14095-14096,​14098-14099,​14107,​14118-14119,​14131,​14135,​14140,​14142,​14157-14158,​14160,​14226,​14228-14230,​14234-14236,​14244-14247,​14250,​14255-14258,​14260,​14267,​14271-14272,​14282,​14284-14298,​14300,​14307,​14314-14316,​14319,​14322,​14332,​14343-14350,​14358,​14367-14368,​14372,​14376,​14378,​14381-14382,​14384,​14388,​14390-14391,​14393-14394,​14396,​14400,​14405,​14407-14408,​14418,​14422-14423,​14425,​14434,​14463-14465,​14468-14469,​14479,​14483,​14486,​14507,​14517,​14523,​14527,​14529,​14531-14533,​14553,​14623,​14630,​14781,​14789-14791,​14805,​14826-14827,​14829-14832,​14839-14840,​14843,​14845-14847,​14851-14854,​14857,​14864-14865,​14871,​14889-14890,​14899,​14904,​14918,​14938,​14940,​14943-14946,​14948-14951,​15002,​15013,​15015-15016,​15023-15024,​15026,​15046,​15052-15054,​15058,​15077,​15085,​15088,​15094,​15103-15106,​15111-15113,​15122-15124,​15129,​15139,​15160,​15163,​15165,​15184-15185,​15187,​15194,​15287,​15371-153
72,​15390,​15396,​15400,​15402,​15427,​15498,​15517
  • stable/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs

    r16435 r16438  
    2323
    2424using System;
     25using System.Collections;
    2526using System.Collections.Generic;
    2627using System.Linq;
     
    3637  [Item("RegressionSolution Impacts Calculator", "Calculation of the impacts of input variables for any regression solution")]
    3738  public sealed class RegressionSolutionVariableImpactsCalculator : ParameterizedNamedItem {
     39    #region Parameters/Properties
    3840    public enum ReplacementMethodEnum {
    3941      Median,
     
    5456
    5557    private const string ReplacementParameterName = "Replacement Method";
     58    private const string FactorReplacementParameterName = "Factor Replacement Method";
    5659    private const string DataPartitionParameterName = "DataPartition";
    5760
    5861    public IFixedValueParameter<EnumValue<ReplacementMethodEnum>> ReplacementParameter {
    5962      get { return (IFixedValueParameter<EnumValue<ReplacementMethodEnum>>)Parameters[ReplacementParameterName]; }
     63    }
     64    public IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>> FactorReplacementParameter {
     65      get { return (IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>>)Parameters[FactorReplacementParameterName]; }
    6066    }
    6167    public IFixedValueParameter<EnumValue<DataPartitionEnum>> DataPartitionParameter {
     
    6773      set { ReplacementParameter.Value.Value = value; }
    6874    }
     75    public FactorReplacementMethodEnum FactorReplacementMethod {
     76      get { return FactorReplacementParameter.Value.Value; }
     77      set { FactorReplacementParameter.Value.Value = value; }
     78    }
    6979    public DataPartitionEnum DataPartition {
    7080      get { return DataPartitionParameter.Value.Value; }
    7181      set { DataPartitionParameter.Value.Value = value; }
    7282    }
    73 
    74 
     83    #endregion
     84
     85    #region Ctor/Cloner
    7586    [StorableConstructor]
    7687    private RegressionSolutionVariableImpactsCalculator(bool deserializing) : base(deserializing) { }
    7788    private RegressionSolutionVariableImpactsCalculator(RegressionSolutionVariableImpactsCalculator original, Cloner cloner)
    7889      : base(original, cloner) { }
     90    public RegressionSolutionVariableImpactsCalculator()
     91      : base() {
     92      Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Shuffle)));
     93      Parameters.Add(new FixedValueParameter<EnumValue<FactorReplacementMethodEnum>>(FactorReplacementParameterName, "The replacement method for factor variables during impact calculation.", new EnumValue<FactorReplacementMethodEnum>(FactorReplacementMethodEnum.Best)));
     94      Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training)));
     95    }
     96
    7997    public override IDeepCloneable Clone(Cloner cloner) {
    8098      return new RegressionSolutionVariableImpactsCalculator(this, cloner);
    8199    }
    82 
    83     public RegressionSolutionVariableImpactsCalculator()
    84       : base() {
    85       Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Median)));
    86       Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training)));
    87     }
     100    #endregion
    88101
    89102    //mkommend: annoying name clash with static method, open to better naming suggestions
    90103    public IEnumerable<Tuple<string, double>> Calculate(IRegressionSolution solution) {
    91       return CalculateImpacts(solution, DataPartition, ReplacementMethod);
     104      return CalculateImpacts(solution, ReplacementMethod, FactorReplacementMethod, DataPartition);
    92105    }
    93106
    94107    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
    95108      IRegressionSolution solution,
    96       DataPartitionEnum data = DataPartitionEnum.Training,
    97       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
     109      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
    98110      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
    99       Func<double, string, bool> progressCallback = null) {
    100 
    101       var problemData = solution.ProblemData;
    102       var dataset = problemData.Dataset;
    103 
    104       IEnumerable<int> rows;
    105       IEnumerable<double> targetValues;
    106       double originalR2 = -1;
    107 
    108       OnlineCalculatorError error;
    109 
    110       switch (data) {
    111         case DataPartitionEnum.All:
    112           rows = solution.ProblemData.AllIndices;
    113           targetValues = problemData.TargetVariableValues.ToList();
    114           originalR2 = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out error);
    115           if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
    116           originalR2 = originalR2 * originalR2;
    117           break;
    118         case DataPartitionEnum.Training:
    119           rows = problemData.TrainingIndices;
    120           targetValues = problemData.TargetVariableTrainingValues.ToList();
    121           originalR2 = solution.TrainingRSquared;
    122           break;
    123         case DataPartitionEnum.Test:
    124           rows = problemData.TestIndices;
    125           targetValues = problemData.TargetVariableTestValues.ToList();
    126           originalR2 = solution.TestRSquared;
    127           break;
    128         default: throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
    129       }
     111      DataPartitionEnum dataPartition = DataPartitionEnum.Training) {
     112
     113      IEnumerable<int> rows = GetPartitionRows(dataPartition, solution.ProblemData);
     114      IEnumerable<double> estimatedValues = solution.GetEstimatedValues(rows);
     115      return CalculateImpacts(solution.Model, solution.ProblemData, estimatedValues, rows, replacementMethod, factorReplacementMethod);
     116    }
     117
     118    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
     119     IRegressionModel model,
     120     IRegressionProblemData problemData,
     121     IEnumerable<double> estimatedValues,
     122     IEnumerable<int> rows,
     123     ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
     124     FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
     125
     126      //fholzing: guard for the case that a different dataset was loaded than the one the model was built on; otherwise this check is negligible
     127      var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);
     128      if (missingVariables.Any()) {
     129        throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
     130      }
     131      IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
     132      var originalQuality = CalculateQuality(targetValues, estimatedValues);
    130133
    131134      var impacts = new Dictionary<string, double>();
    132       var modifiableDataset = ((Dataset)dataset).ToModifiable();
    133 
    134       var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction));
    135       var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList();
    136 
    137       int curIdx = 0;
    138       int count = allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>).Count();
    139       // calculate impacts for double variables
    140       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
    141         //Report the current progress in percent. If the callback returns true, it means the execution shall be stopped
    142         if (progressCallback != null) {
    143           curIdx++;
    144           if (progressCallback((double)curIdx / count, string.Format("Calculating impact for variable {0} ({1} of {2})", inputVariable, curIdx, count))) { return null; }
    145         }
    146         var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
    147         var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    148         if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    149 
    150         newR2 = newR2 * newR2;
    151         var impact = originalR2 - newR2;
    152         impacts[inputVariable] = impact;
    153       }
    154 
    155       // calculate impacts for string variables
    156       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
    157         if (factorReplacementMethod == FactorReplacementMethodEnum.Best) {
    158           // try replacing with all possible values and find the best replacement value
    159           var smallestImpact = double.PositiveInfinity;
    160           foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
    161             var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
    162               Enumerable.Repeat(repl, dataset.Rows));
    163             var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    164             if (error != OnlineCalculatorError.None)
    165               throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    166 
    167             newR2 = newR2 * newR2;
    168             var impact = originalR2 - newR2;
    169             if (impact < smallestImpact) smallestImpact = impact;
    170           }
    171           impacts[inputVariable] = smallestImpact;
    172         } else {
    173           // for replacement methods shuffle and mode
    174           // calculate impacts for factor variables
    175 
    176           var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
    177             factorReplacementMethod);
    178           var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    179           if (error != OnlineCalculatorError.None)
    180             throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    181 
    182           newR2 = newR2 * newR2;
    183           var impact = originalR2 - newR2;
    184           impacts[inputVariable] = impact;
    185         }
    186       } // foreach
    187       return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    188     }
    189 
    190 
    191     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
    192       var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     135      var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
     136      var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();
     137
     138      foreach (var inputVariable in inputvariables) {
     139        impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
     140      }
     141
     142      return impacts.Select(i => Tuple.Create(i.Key, i.Value));
     143    }
     144
     145    public static double CalculateImpact(string variableName,
     146      IRegressionModel model,
     147      IRegressionProblemData problemData,
     148      ModifiableDataset modifiableDataset,
     149      IEnumerable<int> rows,
     150      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
     151      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
     152      IEnumerable<double> targetValues = null,
     153      double quality = double.NaN) {
     154
     155      if (!model.VariablesUsedForPrediction.Contains(variableName)) { return 0.0; }
     156      if (!problemData.Dataset.VariableNames.Contains(variableName)) {
     157        throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
     158      }
     159
     160      if (targetValues == null) {
     161        targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
     162      }
     163      if (quality == double.NaN) {
     164        quality = CalculateQuality(model.GetEstimatedValues(modifiableDataset, rows), targetValues);
     165      }
     166
     167      IList originalValues = null;
     168      IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);
     169
     170      double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
     171      double impact = quality - newValue;
     172
     173      return impact;
     174    }
     175
     176    private static IList GetReplacementValues(ModifiableDataset modifiableDataset,
     177      string variableName,
     178      IRegressionModel model,
     179      IEnumerable<int> rows,
     180      IEnumerable<double> targetValues,
     181      out IList originalValues,
     182      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
     183      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
              // Dispatches on the type of the variable: double variables use replacementMethod,
              // string (factor) variables use factorReplacementMethod.
              // originalValues receives a copy of the variable's current column so that the caller can restore it later.
              // Throws NotSupportedException for variables that are neither double nor string.
     184
     185      IList replacementValues = null;
     186      if (modifiableDataset.VariableHasType<double>(variableName)) {
     187        originalValues = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
     188        replacementValues = GetReplacementValuesForDouble(modifiableDataset, rows, (List<double>)originalValues, replacementMethod);
     189      } else if (modifiableDataset.VariableHasType<string>(variableName)) {
     190        originalValues = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
     191        replacementValues = GetReplacementValuesForString(model, modifiableDataset, variableName, rows, (List<string>)originalValues, targetValues, factorReplacementMethod);
     192      } else {
     193        throw new NotSupportedException("Variable not supported");
     194      }
     195
     196      return replacementValues;
     197    }
     198
     199    private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset,
     200      IEnumerable<int> rows,
     201      List<double> originalValues,
     202      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle) {
     203
     204      IRandom random = new FastRandom(31415);
     205      List<double> replacementValues;
    193206      double replacementValue;
    194       List<double> replacementValues;
    195       IRandom rand;
    196 
    197       switch (replacement) {
     207
     208      switch (replacementMethod) {
    198209        case ReplacementMethodEnum.Median:
    199210          replacementValue = rows.Select(r => originalValues[r]).Median();
    200           replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
     211          replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();
    201212          break;
    202213        case ReplacementMethodEnum.Average:
    203214          replacementValue = rows.Select(r => originalValues[r]).Average();
    204           replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
     215          replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();
    205216          break;
    206217        case ReplacementMethodEnum.Shuffle:
    207218          // new var has same empirical distribution but the relation to y is broken
    208           rand = new FastRandom(31415);
    209219          // prepare a complete column for the dataset
    210           replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
     220          replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
    211221          // shuffle only the selected rows
    212           var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
     222          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
    213223          int i = 0;
    214224          // update column values
     
    220230          var avg = rows.Select(r => originalValues[r]).Average();
    221231          var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
    222           rand = new FastRandom(31415);
    223232          // prepare a complete column for the dataset
    224           replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
     233          replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
    225234          // update column values
    226235          foreach (var r in rows) {
    227             replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
     236            replacementValues[r] = NormalDistributedRandom.NextDouble(random, avg, stdDev);
    228237          }
    229238          break;
    230239
    231240        default:
    232           throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
    233       }
    234 
    235       return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
    236     }
    237 
    238     private static IEnumerable<double> EvaluateModelWithReplacedVariable(
    239       IRegressionModel model, string variable, ModifiableDataset dataset,
    240       IEnumerable<int> rows,
    241       FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
    242       var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
    243       List<string> replacementValues;
    244       IRandom rand;
    245 
    246       switch (replacement) {
     241          throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod));
     242      }
     243
     244      return replacementValues;
     245    }
     246
     247    private static IList GetReplacementValuesForString(IRegressionModel model,
     248      ModifiableDataset modifiableDataset,
     249      string variableName,
     250      IEnumerable<int> rows,
     251      List<string> originalValues,
     252      IEnumerable<double> targetValues,
     253      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) {
     254
     255      List<string> replacementValues = null;
     256      IRandom random = new FastRandom(31415);
     257
     258      switch (factorReplacementMethod) {
     259        case FactorReplacementMethodEnum.Best:
     260          // try replacing with all possible values and find the best replacement value
     261          var bestQuality = double.NegativeInfinity;
     262          foreach (var repl in modifiableDataset.GetStringValues(variableName, rows).Distinct()) {
     263            List<string> curReplacementValues = Enumerable.Repeat(repl, modifiableDataset.Rows).ToList();
     264            //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency
     265            var newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, curReplacementValues, targetValues);
     266            var curQuality = newValue;
     267
     268            if (curQuality > bestQuality) {
     269              bestQuality = curQuality;
     270              replacementValues = curReplacementValues;
     271            }
     272          }
     273          break;
    247274        case FactorReplacementMethodEnum.Mode:
    248275          var mostCommonValue = rows.Select(r => originalValues[r])
     
    250277            .OrderByDescending(g => g.Count())
    251278            .First().Key;
    252           replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
     279          replacementValues = Enumerable.Repeat(mostCommonValue, modifiableDataset.Rows).ToList();
    253280          break;
    254281        case FactorReplacementMethodEnum.Shuffle:
    255282          // new var has same empirical distribution but the relation to y is broken
    256           rand = new FastRandom(31415);
    257283          // prepare a complete column for the dataset
    258           replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
     284          replacementValues = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
    259285          // shuffle only the selected rows
    260           var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
     286          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
    261287          int i = 0;
    262288          // update column values
     
    266292          break;
    267293        default:
    268           throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
    269       }
    270 
    271       return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
    272     }
    273 
    274     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    275       ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
    276       var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
    277       dataset.ReplaceVariable(variable, replacementValues.ToList());
     294          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
     295      }
     296
     297      return replacementValues;
     298    }
     299
     300    private static double CalculateQualityForReplacement(
     301      IRegressionModel model,
     302      ModifiableDataset modifiableDataset,
     303      string variableName,
     304      IList originalValues,
     305      IEnumerable<int> rows,
     306      IList replacementValues,
     307      IEnumerable<double> targetValues) {
     308
     309      modifiableDataset.ReplaceVariable(variableName, replacementValues);
    278310      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    279       var estimates = model.GetEstimatedValues(dataset, rows).ToList();
    280       dataset.ReplaceVariable(variable, originalValues);
    281 
    282       return estimates;
    283     }
    284     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    285       ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
    286       var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
    287       dataset.ReplaceVariable(variable, replacementValues.ToList());
    288       //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    289       var estimates = model.GetEstimatedValues(dataset, rows).ToList();
    290       dataset.ReplaceVariable(variable, originalValues);
    291 
    292       return estimates;
     311      var estimates = model.GetEstimatedValues(modifiableDataset, rows).ToList();
     312      var ret = CalculateQuality(targetValues, estimates);
     313      modifiableDataset.ReplaceVariable(variableName, originalValues);
     314
     315      return ret;
     316    }
     317
     318    public static double CalculateQuality(IEnumerable<double> targetValues, IEnumerable<double> estimatedValues) {
              // Quality measure used for all impact calculations: the square of the value returned by
              // OnlinePearsonsRCalculator.Calculate for the target and estimated values.
              // Throws InvalidOperationException if the calculator reports any error state other than None.
     319      OnlineCalculatorError errorState;
     320      var ret = OnlinePearsonsRCalculator.Calculate(targetValues, estimatedValues, out errorState);
     321      if (errorState != OnlineCalculatorError.None) { throw new InvalidOperationException("Error during calculation with replaced inputs."); }
     322      return ret * ret;
     323    }
     324
     325    public static IEnumerable<int> GetPartitionRows(DataPartitionEnum dataPartition, IRegressionProblemData problemData) {
              // Maps the requested data partition to the corresponding row indices of problemData
              // (AllIndices, TestIndices or TrainingIndices).
              // Throws NotSupportedException for any other DataPartitionEnum value.
     326      IEnumerable<int> rows;
     327
     328      switch (dataPartition) {
     329        case DataPartitionEnum.All:
     330          rows = problemData.AllIndices;
     331          break;
     332        case DataPartitionEnum.Test:
     333          rows = problemData.TestIndices;
     334          break;
     335        case DataPartitionEnum.Training:
     336          rows = problemData.TrainingIndices;
     337          break;
     338        default:
     339          throw new NotSupportedException("DataPartition not supported");
     340      }
     341
     342      return rows;
    293343    }
    294344  }
Note: See TracChangeset for help on using the changeset viewer.