Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp: 03/18/17 12:47:30 (7 years ago)
Author: gkronber
Message: #2650: Added an option to specify the replacement method for factor variables.

File: 1 edited

Legend:

Unmodified
Added
Removed
  • branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs

    r14498 r14762  
    4242      Noise
    4343    }
    44 
     44    public enum FactorReplacementMethodEnum {
     45      Best,
     46      Mode,
     47      Shuffle
     48    }
    4549    public enum DataPartitionEnum {
    4650      Training,
     
    8892    }
    8993
    90     public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
     94    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
     95      IRegressionSolution solution,
    9196      DataPartitionEnum data = DataPartitionEnum.Training,
    92       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median) {
     97      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
     98      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
    9399
    94100      var problemData = solution.ProblemData;
     
    101107      OnlineCalculatorError error;
    102108
    103       switch (data) {
     109      switch(data) {
    104110        case DataPartitionEnum.All:
    105111          rows = solution.ProblemData.AllIndices;
    106112          targetValues = problemData.TargetVariableValues.ToList();
    107113          originalR2 = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out error);
    108           if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
     114          if(error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
    109115          originalR2 = originalR2 * originalR2;
    110116          break;
     
    129135
    130136      // calculate impacts for double variables
    131       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
     137      foreach(var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
    132138        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
    133139        var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    134         if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     140        if(error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    135141
    136142        newR2 = newR2 * newR2;
     
    138144        impacts[inputVariable] = impact;
    139145      }
    140       // calculate impacts for factor variables
    141       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
    142         var smallestImpact = double.PositiveInfinity;
    143         foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
    144           var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, Enumerable.Repeat(repl, dataset.Rows));
     146
     147      // calculate impacts for string variables
     148      foreach(var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
     149        if(factorReplacementMethod == FactorReplacementMethodEnum.Best) {
     150          // try replacing with all possible values and find the best replacement value
     151          var smallestImpact = double.PositiveInfinity;
     152          foreach(var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
     153            var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     154              Enumerable.Repeat(repl, dataset.Rows));
     155            var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
     156            if(error != OnlineCalculatorError.None)
     157              throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     158
     159            newR2 = newR2 * newR2;
     160            var impact = originalR2 - newR2;
     161            if(impact < smallestImpact) smallestImpact = impact;
     162          }
     163          impacts[inputVariable] = smallestImpact;
     164        } else {
     165          // for replacement methods shuffle and mode
     166          // calculate impacts for factor variables
     167
     168          var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     169            factorReplacementMethod);
    145170          var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    146           if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     171          if(error != OnlineCalculatorError.None)
     172            throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    147173
    148174          newR2 = newR2 * newR2;
    149175          var impact = originalR2 - newR2;
    150           if (impact < smallestImpact) smallestImpact = impact;
     176          impacts[inputVariable] = impact;
    151177        }
    152         impacts[inputVariable] = smallestImpact;
    153       }
     178      } // foreach
    154179      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    155180    }
     
    161186      IRandom rand;
    162187
    163       switch (replacement) {
     188      switch(replacement) {
    164189        case ReplacementMethodEnum.Median:
    165190          replacementValue = rows.Select(r => originalValues[r]).Median();
     
    179204          int i = 0;
    180205          // update column values
    181           foreach (var r in rows) {
     206          foreach(var r in rows) {
    182207            replacementValues[r] = shuffledValues[i++];
    183208          }
     
    190215          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
    191216          // update column values
    192           foreach (var r in rows) {
     217          foreach(var r in rows) {
    193218            replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
    194219          }
     
    202227    }
    203228
    204     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
     229    private static IEnumerable<double> EvaluateModelWithReplacedVariable(
     230      IRegressionModel model, string variable, ModifiableDataset dataset,
     231      IEnumerable<int> rows,
     232      FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
     233      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     234      List<string> replacementValues;
     235      IRandom rand;
     236
     237      switch(replacement) {
     238        case FactorReplacementMethodEnum.Mode:
     239          var mostCommonValue = rows.Select(r => originalValues[r])
     240            .GroupBy(v => v)
     241            .OrderByDescending(g => g.Count())
     242            .First().Key;
     243          replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
     244          break;
     245        case FactorReplacementMethodEnum.Shuffle:
     246          // new var has same empirical distribution but the relation to y is broken
     247          rand = new FastRandom(31415);
     248          // prepare a complete column for the dataset
     249          replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
     250          // shuffle only the selected rows
     251          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
     252          int i = 0;
     253          // update column values
     254          foreach(var r in rows) {
     255            replacementValues[r] = shuffledValues[i++];
     256          }
     257          break;
     258        default:
     259          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
     260      }
     261
     262      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     263    }
     264
     265    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    205266      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
    206267      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     
    212273      return estimates;
    213274    }
    214     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, 
     275    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    215276      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
    216277      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
Note: See TracChangeset for help on using the changeset viewer.