Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
04/04/17 17:52:44 (8 years ago)
Author:
gkronber
Message:

#2650: merged the factors branch into trunk

Location:
trunk/sources
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources

  • trunk/sources/HeuristicLab.Problems.DataAnalysis

  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs

    r13761 r14826  
    168168      get { return variableValues.Where(p => p.Value is List<double>).Select(p => p.Key); }
    169169    }
     170
     171    public IEnumerable<string> StringVariables {
     172      get { return variableValues.Where(p => p.Value is List<string>).Select(p => p.Key); }
     173    }
     174
    170175    public IEnumerable<double> GetDoubleValues(string variableName) {
    171176      return GetValues<double>(variableName);
     
    189194      return GetValues<double>(variableName, rows);
    190195    }
     196
     197    public string GetStringValue(string variableName, int row) {
     198      var values = GetValues<string>(variableName);
     199      return values[row];
     200    }
     201
     202    public IEnumerable<string> GetStringValues(string variableName, IEnumerable<int> rows) {
     203      return GetValues<string>(variableName, rows);
     204    }
     205    public ReadOnlyCollection<string> GetReadOnlyStringValues(string variableName) {
     206      var values = GetValues<string>(variableName);
     207      return values.AsReadOnly();
     208    }
     209
    191210    private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) {
    192211      var values = GetValues<T>(variableName);
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/DataAnalysisProblemData.cs

    r14507 r14826  
    152152    protected DataAnalysisProblemData(IDataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<ITransformation> transformations = null) {
    153153      if (dataset == null) throw new ArgumentNullException("The dataset must not be null.");
    154       if (allowedInputVariables == null) throw new ArgumentNullException("The allowedInputVariables must not be null.");
    155 
    156       if (allowedInputVariables.Except(dataset.DoubleVariables).Any())
    157         throw new ArgumentException("All allowed input variables must be present in the dataset and of type double.");
    158 
    159       var inputVariables = new CheckedItemList<StringValue>(dataset.DoubleVariables.Select(x => new StringValue(x)));
     154      if (allowedInputVariables == null) throw new ArgumentNullException("The allowed input variables must not be null.");
     155
     156      if (allowedInputVariables.Except(dataset.DoubleVariables).Except(dataset.StringVariables).Any())
     157        throw new ArgumentException("All allowed input variables must be present in the dataset and of type double or string.");
     158
     159      var variables = dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable));
     160      var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x)));
    160161      foreach (StringValue x in inputVariables)
    161162        inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value));
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs

    r14463 r14826  
    4242      Noise
    4343    }
    44 
     44    public enum FactorReplacementMethodEnum {
     45      Best,
     46      Mode,
     47      Shuffle
     48    }
    4549    public enum DataPartitionEnum {
    4650      Training,
     
    8892    }
    8993
    90     public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
     94    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
     95      IRegressionSolution solution,
    9196      DataPartitionEnum data = DataPartitionEnum.Training,
    92       ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
     97      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
     98      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
    9399
    94100      var problemData = solution.ProblemData;
     
    128134      var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList();
    129135
    130       foreach (var inputVariable in allowedInputVariables) {
    131         var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacement);
     136      // calculate impacts for double variables
     137      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
     138        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
    132139        var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
    133140        if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     
    137144        impacts[inputVariable] = impact;
    138145      }
     146
     147      // calculate impacts for string variables
     148      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
     149        if (factorReplacementMethod == FactorReplacementMethodEnum.Best) {
     150          // try replacing with all possible values and find the best replacement value
     151          var smallestImpact = double.PositiveInfinity;
     152          foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
     153            var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     154              Enumerable.Repeat(repl, dataset.Rows));
     155            var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
     156            if (error != OnlineCalculatorError.None)
     157              throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     158
     159            newR2 = newR2 * newR2;
     160            var impact = originalR2 - newR2;
     161            if (impact < smallestImpact) smallestImpact = impact;
     162          }
     163          impacts[inputVariable] = smallestImpact;
     164        } else {
     165          // for replacement methods shuffle and mode
     166          // calculate impacts for factor variables
     167
     168          var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     169            factorReplacementMethod);
     170          var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
     171          if (error != OnlineCalculatorError.None)
     172            throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     173
     174          newR2 = newR2 * newR2;
     175          var impact = originalR2 - newR2;
     176          impacts[inputVariable] = impact;
     177        }
     178      } // foreach
    139179      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
    140180    }
     
    184224      }
    185225
    186       dataset.ReplaceVariable(variable, replacementValues);
     226      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     227    }
     228
     229    private static IEnumerable<double> EvaluateModelWithReplacedVariable(
     230      IRegressionModel model, string variable, ModifiableDataset dataset,
     231      IEnumerable<int> rows,
     232      FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
     233      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     234      List<string> replacementValues;
     235      IRandom rand;
     236
     237      switch (replacement) {
     238        case FactorReplacementMethodEnum.Mode:
     239          var mostCommonValue = rows.Select(r => originalValues[r])
     240            .GroupBy(v => v)
     241            .OrderByDescending(g => g.Count())
     242            .First().Key;
     243          replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
     244          break;
     245        case FactorReplacementMethodEnum.Shuffle:
     246          // new var has same empirical distribution but the relation to y is broken
     247          rand = new FastRandom(31415);
     248          // prepare a complete column for the dataset
     249          replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
     250          // shuffle only the selected rows
     251          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
     252          int i = 0;
     253          // update column values
     254          foreach (var r in rows) {
     255            replacementValues[r] = shuffledValues[i++];
     256          }
     257          break;
     258        default:
     259          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
     260      }
     261
     262      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     263    }
     264
     265    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
     266      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
     267      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     268      dataset.ReplaceVariable(variable, replacementValues.ToList());
    187269      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    188270      var estimates = model.GetEstimatedValues(dataset, rows).ToList();
     
    191273      return estimates;
    192274    }
     275    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
     276      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
     277      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     278      dataset.ReplaceVariable(variable, replacementValues.ToList());
     279      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
     280      var estimates = model.GetEstimatedValues(dataset, rows).ToList();
     281      dataset.ReplaceVariable(variable, originalValues);
     282
     283      return estimates;
     284    }
    193285  }
    194286}
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Interfaces/IDataset.cs

    r14185 r14826  
    3030    IEnumerable<string> VariableNames { get; }
    3131    IEnumerable<string> DoubleVariables { get; }
     32    IEnumerable<string> StringVariables { get; }
     33
     34    bool VariableHasType<T>(string variableName);
    3235
    3336    double GetDoubleValue(string variableName, int row);
     
    3639    ReadOnlyCollection<double> GetReadOnlyDoubleValues(string variableName);
    3740
     41    string GetStringValue(string variableName, int row);
    3842    IEnumerable<string> GetStringValues(string variableName);
     43    IEnumerable<string> GetStringValues(string variableName, IEnumerable<int> rows);
     44    ReadOnlyCollection<string> GetReadOnlyStringValues(string VariableName);
     45
    3946    IEnumerable<DateTime> GetDateTimeValues(string variableName);
    4047  }
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/ModifiableDataset.cs

    r13761 r14826  
    7777    public void ReplaceVariable(string variableName, IList values) {
    7878      if (!variableValues.ContainsKey(variableName))
    79         throw new ArgumentException(string.Format("Variable {0} is not present in the dataset."), variableName);
     79        throw new ArgumentException(string.Format("Variable {0} is not present in the dataset.", variableName));
    8080      if (values.Count != variableValues[variableName].Count)
    8181        throw new ArgumentException("The number of values must coincide with the number of dataset rows.");
Note: See TracChangeset for help on using the changeset viewer.