Changeset 14762


Ignore:
Timestamp:
03/18/17 12:47:30 (7 months ago)
Author:
gkronber
Message:

#2650: Added an option to specify the replacement method for factor variables

Location:
branches/symbreg-factors-2650
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Views/3.4/Regression/RegressionSolutionVariableImpactsView.Designer.cs

    r14351 r14762  
    4747      this.dataPartitionComboBox = new System.Windows.Forms.ComboBox();
    4848      this.dataPartitionLabel = new System.Windows.Forms.Label();
    49       this.replacementLabel = new System.Windows.Forms.Label();
     49      this.numericVarReplacementLabel = new System.Windows.Forms.Label();
    5050      this.replacementComboBox = new System.Windows.Forms.ComboBox();
     51      this.factorVarReplacementLabel = new System.Windows.Forms.Label();
     52      this.factorVarReplComboBox = new System.Windows.Forms.ComboBox();
    5153      this.SuspendLayout();
    5254      //
     
    5860      this.variableImactsArrayView.Caption = "StringConvertibleArray View";
    5961      this.variableImactsArrayView.Content = null;
    60       this.variableImactsArrayView.Location = new System.Drawing.Point(3, 59);
     62      this.variableImactsArrayView.Location = new System.Drawing.Point(3, 84);
    6163      this.variableImactsArrayView.Name = "variableImactsArrayView";
    6264      this.variableImactsArrayView.ReadOnly = true;
    63       this.variableImactsArrayView.Size = new System.Drawing.Size(304, 223);
    64       this.variableImactsArrayView.TabIndex = 4;
     65      this.variableImactsArrayView.Size = new System.Drawing.Size(363, 278);
     66      this.variableImactsArrayView.TabIndex = 2;
    6567      //
    6668      // dataPartitionComboBox
     
    7173            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.DataPartitionEnum.Test,
    7274            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.DataPartitionEnum.All});
    73       this.dataPartitionComboBox.Location = new System.Drawing.Point(82, 3);
     75      this.dataPartitionComboBox.Location = new System.Drawing.Point(197, 3);
    7476      this.dataPartitionComboBox.Name = "dataPartitionComboBox";
    7577      this.dataPartitionComboBox.Size = new System.Drawing.Size(121, 21);
     
    8688      this.dataPartitionLabel.Text = "Data partition:";
    8789      //
    88       // replacementLabel
     90      // numericVarReplacementLabel
    8991      //
    90       this.replacementLabel.AutoSize = true;
    91       this.replacementLabel.Location = new System.Drawing.Point(3, 35);
    92       this.replacementLabel.Name = "replacementLabel";
    93       this.replacementLabel.Size = new System.Drawing.Size(73, 13);
    94       this.replacementLabel.TabIndex = 2;
    95       this.replacementLabel.Text = "Replacement:";
     92      this.numericVarReplacementLabel.AutoSize = true;
     93      this.numericVarReplacementLabel.Location = new System.Drawing.Point(3, 33);
     94      this.numericVarReplacementLabel.Name = "numericVarReplacementLabel";
     95      this.numericVarReplacementLabel.Size = new System.Drawing.Size(173, 13);
     96      this.numericVarReplacementLabel.TabIndex = 2;
     97      this.numericVarReplacementLabel.Text = "Replacement for numeric variables:";
    9698      //
    9799      // replacementComboBox
     
    103105            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.ReplacementMethodEnum.Noise,
    104106            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.ReplacementMethodEnum.Shuffle});
    105       this.replacementComboBox.Location = new System.Drawing.Point(82, 32);
     107      this.replacementComboBox.Location = new System.Drawing.Point(197, 30);
    106108      this.replacementComboBox.Name = "replacementComboBox";
    107109      this.replacementComboBox.Size = new System.Drawing.Size(121, 21);
     
    109111      this.replacementComboBox.SelectedIndexChanged += new System.EventHandler(this.replacementComboBox_SelectedIndexChanged);
    110112      //
     113      // factorVarReplacementLabel
     114      //
     115      this.factorVarReplacementLabel.AutoSize = true;
     116      this.factorVarReplacementLabel.Location = new System.Drawing.Point(3, 60);
     117      this.factorVarReplacementLabel.Name = "factorVarReplacementLabel";
     118      this.factorVarReplacementLabel.Size = new System.Drawing.Size(188, 13);
     119      this.factorVarReplacementLabel.TabIndex = 0;
     120      this.factorVarReplacementLabel.Text = "Replacement for categorical variables:";
     121      //
     122      // factorVarReplComboBox
     123      //
     124      this.factorVarReplComboBox.FormattingEnabled = true;
     125      this.factorVarReplComboBox.Items.AddRange(new object[] {
     126            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.FactorReplacementMethodEnum.Best,
     127            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.FactorReplacementMethodEnum.Mode,
     128            HeuristicLab.Problems.DataAnalysis.RegressionSolutionVariableImpactsCalculator.FactorReplacementMethodEnum.Shuffle});
     129      this.factorVarReplComboBox.Location = new System.Drawing.Point(197, 57);
     130      this.factorVarReplComboBox.Name = "factorVarReplComboBox";
     131      this.factorVarReplComboBox.Size = new System.Drawing.Size(121, 21);
     132      this.factorVarReplComboBox.TabIndex = 1;
     133      this.factorVarReplComboBox.SelectedIndexChanged += new System.EventHandler(this.replacementComboBox_SelectedIndexChanged);
     134      //
    111135      // RegressionSolutionVariableImpactsView
    112136      //
    113137      this.AllowDrop = true;
    114138      this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Inherit;
     139      this.Controls.Add(this.factorVarReplComboBox);
     140      this.Controls.Add(this.factorVarReplacementLabel);
    115141      this.Controls.Add(this.replacementComboBox);
    116       this.Controls.Add(this.replacementLabel);
     142      this.Controls.Add(this.numericVarReplacementLabel);
    117143      this.Controls.Add(this.dataPartitionLabel);
    118144      this.Controls.Add(this.dataPartitionComboBox);
    119145      this.Controls.Add(this.variableImactsArrayView);
    120146      this.Name = "RegressionSolutionVariableImpactsView";
    121       this.Size = new System.Drawing.Size(310, 285);
     147      this.Size = new System.Drawing.Size(369, 365);
    122148      this.ResumeLayout(false);
    123149      this.PerformLayout();
     
    130156    private System.Windows.Forms.ComboBox dataPartitionComboBox;
    131157    private System.Windows.Forms.Label dataPartitionLabel;
    132     private System.Windows.Forms.Label replacementLabel;
     158    private System.Windows.Forms.Label numericVarReplacementLabel;
    133159    private System.Windows.Forms.ComboBox replacementComboBox;
     160    private System.Windows.Forms.Label factorVarReplacementLabel;
     161    private System.Windows.Forms.ComboBox factorVarReplComboBox;
    134162  }
    135163}
  • branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Views/3.4/Regression/RegressionSolutionVariableImpactsView.cs

    r14351 r14762  
    4343      this.dataPartitionComboBox.SelectedIndex = 0;
    4444      this.replacementComboBox.SelectedIndex = 0;
    45     }
     45      this.factorVarReplComboBox.SelectedIndex = 0;
     46      }
    4647
    4748    #region events
     
    6869    protected override void OnContentChanged() {
    6970      base.OnContentChanged();
    70       if (Content == null) {
     71      if(Content == null) {
    7172        variableImactsArrayView.Content = null;
    7273      } else {
     
    7677
    7778    private void UpdateVariableImpacts() {
    78       if (Content == null || replacementComboBox.SelectedIndex < 0 || dataPartitionComboBox.SelectedIndex < 0) return;
     79      if(Content == null || replacementComboBox.SelectedIndex < 0
     80        || factorVarReplComboBox.SelectedIndex < 0
     81        || dataPartitionComboBox.SelectedIndex < 0) return;
    7982      var mainForm = (MainForm.WindowsForms.MainForm)MainFormManager.MainForm;
    8083      variableImactsArrayView.Caption = Content.Name + " Variable Impacts";
    8184      var replMethod =
    82          (RegressionSolutionVariableImpactsCalculator.ReplacementMethodEnum)replacementComboBox.Items[replacementComboBox.SelectedIndex];
     85         (RegressionSolutionVariableImpactsCalculator.ReplacementMethodEnum)
     86           replacementComboBox.Items[replacementComboBox.SelectedIndex];
     87      var factorReplMethod =
     88        (RegressionSolutionVariableImpactsCalculator.FactorReplacementMethodEnum)
     89          factorVarReplComboBox.Items[factorVarReplComboBox.SelectedIndex];
    8390      var dataPartition =
    8491        (RegressionSolutionVariableImpactsCalculator.DataPartitionEnum)dataPartitionComboBox.SelectedItem;
     
    8895          mainForm.AddOperationProgressToView(this, "Calculating variable impacts for " + Content.Name);
    8996
    90           var impacts = RegressionSolutionVariableImpactsCalculator.CalculateImpacts(Content, dataPartition, replMethod);
     97          var impacts = RegressionSolutionVariableImpactsCalculator.CalculateImpacts(Content, dataPartition, replMethod, factorReplMethod);
    9198          var impactArray = new DoubleArray(impacts.Select(i => i.Item2).ToArray());
    9299          impactArray.ElementNames = impacts.Select(i => i.Item1);
  • branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs

    r14498 r14762  
    4242      Noise
    4343    }
    44 
     44    public enum FactorReplacementMethodEnum {
     45      Best,
     46      Mode,
     47      Shuffle
     48    }
    4549    public enum DataPartitionEnum {
    4650      Training,
     
    8892    }
    8993
    90     public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
     94    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
     95      IRegressionSolution solution,
    9196      DataPartitionEnum data = DataPartitionEnum.Training,
    92       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median) {
     97      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
     98      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
    9399
    94100      var problemData = solution.ProblemData;
     
    101107      OnlineCalculatorError error;
    102108
    103       switch (data) {
     109      switch(data) {
    104110        case DataPartitionEnum.All:
    105111          rows = solution.ProblemData.AllIndices;
    106112          targetValues = problemData.TargetVariableValues.ToList();
    107113          originalR2 = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out error);
    108           if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
     114          if(error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation.");
    109115          originalR2 = originalR2 * originalR2;
    110116          break;
     
    129135
    130136      // calculate impacts for double variables
    131       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
     137      foreach(var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
    132138        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
    133139        var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    134         if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     140        if(error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    135141
    136142        newR2 = newR2 * newR2;
     
    138144        impacts[inputVariable] = impact;
    139145      }
    140       // calculate impacts for factor variables
    141       foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
    142         var smallestImpact = double.PositiveInfinity;
    143         foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
    144           var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, Enumerable.Repeat(repl, dataset.Rows));
     146
     147      // calculate impacts for string variables
     148      foreach(var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
     149        if(factorReplacementMethod == FactorReplacementMethodEnum.Best) {
     150          // try replacing with all possible values and find the best replacement value
     151          var smallestImpact = double.PositiveInfinity;
     152          foreach(var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
     153            var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     154              Enumerable.Repeat(repl, dataset.Rows));
     155            var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
     156            if(error != OnlineCalculatorError.None)
     157              throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     158
     159            newR2 = newR2 * newR2;
     160            var impact = originalR2 - newR2;
     161            if(impact < smallestImpact) smallestImpact = impact;
     162          }
     163          impacts[inputVariable] = smallestImpact;
     164        } else {
     165          // for replacement methods shuffle and mode
     166          // calculate impacts for factor variables
     167
     168          var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     169            factorReplacementMethod);
    145170          var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    146           if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     171          if(error != OnlineCalculatorError.None)
     172            throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    147173
    148174          newR2 = newR2 * newR2;
    149175          var impact = originalR2 - newR2;
    150           if (impact < smallestImpact) smallestImpact = impact;
     176          impacts[inputVariable] = impact;
    151177        }
    152         impacts[inputVariable] = smallestImpact;
    153       }
     178      } // foreach
    154179      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    155180    }
     
    161186      IRandom rand;
    162187
    163       switch (replacement) {
     188      switch(replacement) {
    164189        case ReplacementMethodEnum.Median:
    165190          replacementValue = rows.Select(r => originalValues[r]).Median();
     
    179204          int i = 0;
    180205          // update column values
    181           foreach (var r in rows) {
     206          foreach(var r in rows) {
    182207            replacementValues[r] = shuffledValues[i++];
    183208          }
     
    190215          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
    191216          // update column values
    192           foreach (var r in rows) {
     217          foreach(var r in rows) {
    193218            replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
    194219          }
     
    202227    }
    203228
    204     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
     229    private static IEnumerable<double> EvaluateModelWithReplacedVariable(
     230      IRegressionModel model, string variable, ModifiableDataset dataset,
     231      IEnumerable<int> rows,
     232      FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
     233      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     234      List<string> replacementValues;
     235      IRandom rand;
     236
     237      switch(replacement) {
     238        case FactorReplacementMethodEnum.Mode:
     239          var mostCommonValue = rows.Select(r => originalValues[r])
     240            .GroupBy(v => v)
     241            .OrderByDescending(g => g.Count())
     242            .First().Key;
     243          replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
     244          break;
     245        case FactorReplacementMethodEnum.Shuffle:
     246          // new var has same empirical distribution but the relation to y is broken
     247          rand = new FastRandom(31415);
     248          // prepare a complete column for the dataset
     249          replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
     250          // shuffle only the selected rows
     251          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
     252          int i = 0;
     253          // update column values
     254          foreach(var r in rows) {
     255            replacementValues[r] = shuffledValues[i++];
     256          }
     257          break;
     258        default:
     259          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
     260      }
     261
     262      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     263    }
     264
     265    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    205266      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
    206267      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     
    212273      return estimates;
    213274    }
    214     private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, 
     275    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
    215276      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
    216277      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
Note: See TracChangeset for help on using the changeset viewer.