Changeset 16188


Ignore:
Timestamp:
09/27/18 09:50:33 (14 months ago)
Author:
fholzing
Message:

#2904: Merged changes from trunk

Location:
branches/2904_CalculateImpacts
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • branches/2904_CalculateImpacts/3.4

  • branches/2904_CalculateImpacts/3.4/Dataset.cs

    r15769 r16188  
    3838    protected Dataset(Dataset original, Cloner cloner)
    3939      : base(original, cloner) {
     40      // no need to clone the variable values because these can't be modified
    4041      variableValues = new Dictionary<string, IList>(original.variableValues);
    4142      variableNames = new List<string>(original.variableNames);
    4243      rows = original.rows;
    4344    }
     45
    4446    public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); }
    4547
     
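    5860    /// <param name="variableValues">The values for the variables (column-oriented storage). As of r16188 this constructor clones the values (it delegates with cloneValues: true); previously the values were not cloned.</param>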
    5961    public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues)
    60       : base() {
     62      : this(variableNames, variableValues, cloneValues: true) {
     63    }
     64
     65    protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) {
    6166      Name = "-";
    62       if (!variableNames.Any()) {
     67
     68      if (variableNames.Any()) {
     69        this.variableNames = new List<string>(variableNames);
     70      } else {
    6371        this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList();
    64       } else if (variableNames.Count() != variableValues.Count()) {
    65         throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
    66       } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
    67         throw new ArgumentException("The number of values must be equal for every variable");
    68       } else if (variableNames.Distinct().Count() != variableNames.Count()) {
    69         var duplicateVariableNames =
    70           variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
    71         string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
    72         foreach (var duplicateVariableName in duplicateVariableNames)
    73           message += duplicateVariableName + Environment.NewLine;
    74         throw new ArgumentException(message);
    75       }
     72      }
     73      // check if the arguments are consistent (no duplicate variables, same number of rows, correct data types, ...)
     74      CheckArguments(this.variableNames, variableValues);
     75
    7676      rows = variableValues.First().Count;
    77       this.variableNames = new List<string>(variableNames);
    78       this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
    79       for (int i = 0; i < this.variableNames.Count; i++) {
    80         var variableName = this.variableNames[i];
    81         var values = variableValues.ElementAt(i);
    82 
    83         if (!IsAllowedType(values)) {
    84           throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     77
     78      if (cloneValues) {
     79        this.variableValues = CloneValues(this.variableNames, variableValues);
     80      } else {
     81        this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
     82        for (int i = 0; i < this.variableNames.Count; i++) {
     83          var variableName = this.variableNames[i];
     84          var values = variableValues.ElementAt(i);
     85          this.variableValues.Add(variableName, values);
    8586        }
    86 
    87         this.variableValues.Add(variableName, values);
    8887      }
    8988    }
     
    117116
    118117    public ModifiableDataset ToModifiable() {
    119       var values = new List<IList>();
    120       foreach (var v in variableNames) {
    121         if (VariableHasType<double>(v)) {
    122           values.Add(new List<double>((IList<double>)variableValues[v]));
    123         } else if (VariableHasType<string>(v)) {
    124           values.Add(new List<string>((IList<string>)variableValues[v]));
    125         } else if (VariableHasType<DateTime>(v)) {
    126           values.Add(new List<DateTime>((IList<DateTime>)variableValues[v]));
    127         } else {
    128           throw new ArgumentException("Unknown variable type.");
    129         }
    130       }
    131       return new ModifiableDataset(variableNames, values);
     118      return new ModifiableDataset(variableNames, variableNames.Select(v => variableValues[v]), true);
    132119    }
    133120
     
    142129    }
    143130
    144     protected Dataset(Dataset dataset) : this(dataset.variableNames, dataset.variableValues.Values) { }
     131
    145132
    146133    #region Backwards compatible code, remove with 3.5
     
    238225      return new ReadOnlyCollection<DateTime>(values);
    239226    }
    240 
    241 
    242227    private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) {
    243228      var values = GetValues<T>(variableName);
     
    255240      return variableValues[variableName] is IList<T>;
    256241    }
    257 
    258242    protected Type GetVariableType(string variableName) {
    259243      IList list;
     
    263247      return GetElementType(list);
    264248    }
    265 
    266     protected Type GetElementType(IList list) {
     249    protected static Type GetElementType(IList list) {
    267250      var type = list.GetType();
    268251      return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType();
    269252    }
    270 
    271     protected bool IsAllowedType(IList list) {
     253    protected static bool IsAllowedType(IList list) {
    272254      var type = GetElementType(list);
    273255      return IsAllowedType(type);
    274256    }
    275 
    276     protected bool IsAllowedType(Type type) {
     257    protected static bool IsAllowedType(Type type) {
    277258      return type == typeof(double) || type == typeof(string) || type == typeof(DateTime);
     259    }
     260
     261    protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     262      if (variableNames.Count() != variableValues.Count()) {
     263        throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
     264      } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
     265        throw new ArgumentException("The number of values must be equal for every variable");
     266      } else if (variableNames.Distinct().Count() != variableNames.Count()) {
     267        var duplicateVariableNames =
     268          variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
     269        string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
     270        foreach (var duplicateVariableName in duplicateVariableNames)
     271          message += duplicateVariableName + Environment.NewLine;
     272        throw new ArgumentException(message);
     273      }
     274      // check if all the variables are supported
     275      foreach (var t in variableNames.Zip(variableValues, Tuple.Create)) {
     276        var variableName = t.Item1;
     277        var values = t.Item2;
     278
     279        if (!IsAllowedType(values)) {
     280          throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     281        }
     282      }
     283    }
     284
     285    protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) {
     286      return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value));
     287    }
     288
     289    protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     290      return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2));
     291    }
     292
     293    protected static IList CloneValues(IList values) {
     294      var doubleValues = values as IList<double>;
     295      if (doubleValues != null) return new List<double>(doubleValues);
     296
     297      var stringValues = values as IList<string>;
     298      if (stringValues != null) return new List<string>(stringValues);
     299
     300      var dateTimeValues = values as IList<DateTime>;
     301      if (dateTimeValues != null) return new List<DateTime>(dateTimeValues);
     302
     303      throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values)));
    278304    }
    279305
    280306    #region IStringConvertibleMatrix Members
    281307    [Storable]
    282     protected int rows;
     308    private int rows;
    283309    public int Rows {
    284310      get { return rows; }
     311      protected set { rows = value; }
    285312    }
    286313    int IStringConvertibleMatrix.Rows {
  • branches/2904_CalculateImpacts/3.4/Implementation/Classification/ClassificationSolutionVariableImpactsCalculator.cs

    r16181 r16188  
    2323
    2424using System;
    25 using System.Collections;
    2625using System.Collections.Generic;
    2726using System.Linq;
     
    3736  [Item("ClassificationSolution Impacts Calculator", "Calculation of the impacts of input variables for any classification solution")]
    3837  public sealed class ClassificationSolutionVariableImpactsCalculator : ParameterizedNamedItem {
    39     #region Parameters/Properties
    4038    public enum ReplacementMethodEnum {
    4139      Median,
     
    5654
    5755    private const string ReplacementParameterName = "Replacement Method";
    58     private const string FactorReplacementParameterName = "Factor Replacement Method";
    5956    private const string DataPartitionParameterName = "DataPartition";
    6057
    6158    public IFixedValueParameter<EnumValue<ReplacementMethodEnum>> ReplacementParameter {
    6259      get { return (IFixedValueParameter<EnumValue<ReplacementMethodEnum>>)Parameters[ReplacementParameterName]; }
    63     }
    64     public IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>> FactorReplacementParameter {
    65       get { return (IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>>)Parameters[FactorReplacementParameterName]; }
    6660    }
    6761    public IFixedValueParameter<EnumValue<DataPartitionEnum>> DataPartitionParameter {
     
    7367      set { ReplacementParameter.Value.Value = value; }
    7468    }
    75     public FactorReplacementMethodEnum FactorReplacementMethod {
    76       get { return FactorReplacementParameter.Value.Value; }
    77       set { FactorReplacementParameter.Value.Value = value; }
    78     }
    7969    public DataPartitionEnum DataPartition {
    8070      get { return DataPartitionParameter.Value.Value; }
    8171      set { DataPartitionParameter.Value.Value = value; }
    8272    }
    83     #endregion
    84 
    85     #region Ctor/Cloner
     73
     74
    8675    [StorableConstructor]
    8776    private ClassificationSolutionVariableImpactsCalculator(bool deserializing) : base(deserializing) { }
    8877    private ClassificationSolutionVariableImpactsCalculator(ClassificationSolutionVariableImpactsCalculator original, Cloner cloner)
    8978      : base(original, cloner) { }
     79    public override IDeepCloneable Clone(Cloner cloner) {
     80      return new ClassificationSolutionVariableImpactsCalculator(this, cloner);
     81    }
     82
    9083    public ClassificationSolutionVariableImpactsCalculator()
    9184      : base() {
    92       Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Shuffle)));
    93       Parameters.Add(new FixedValueParameter<EnumValue<FactorReplacementMethodEnum>>(FactorReplacementParameterName, "The replacement method for factor variables during impact calculation.", new EnumValue<FactorReplacementMethodEnum>(FactorReplacementMethodEnum.Best)));
     85      Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Median)));
    9486      Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training)));
    9587    }
    96 
    97     public override IDeepCloneable Clone(Cloner cloner) {
    98       return new ClassificationSolutionVariableImpactsCalculator(this, cloner);
    99     }
    100     #endregion
    10188
    10289    //mkommend: annoying name clash with static method, open to better naming suggestions
    10390    public IEnumerable<Tuple<string, double>> Calculate(IClassificationSolution solution) {
    104       return CalculateImpacts(solution, ReplacementMethod, FactorReplacementMethod, DataPartition);
     91      return CalculateImpacts(solution, DataPartition, ReplacementMethod);
    10592    }
    10693
    10794    public static IEnumerable<Tuple<string, double>> CalculateImpacts(
    10895      IClassificationSolution solution,
    109       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
    110       FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
    111       DataPartitionEnum dataPartition = DataPartitionEnum.Training) {
    112 
    113       IEnumerable<int> rows = GetPartitionRows(dataPartition, solution.ProblemData);
    114       IEnumerable<double> estimatedClassValues = solution.GetEstimatedClassValues(rows);
    115       return CalculateImpacts(solution.Model, solution.ProblemData, estimatedClassValues, rows, replacementMethod, factorReplacementMethod);
    116     }
    117 
    118     public static IEnumerable<Tuple<string, double>> CalculateImpacts(
    119      IClassificationModel model,
    120      IClassificationProblemData problemData,
    121      IEnumerable<double> estimatedClassValues,
    122      IEnumerable<int> rows,
    123      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
    124      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
    125 
    126       //fholzing: guard in case a different dataset is loaded; otherwise this check is negligible
    127       var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);
    128       if (missingVariables.Any()) {
    129         throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
    130       }
    131       IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
    132       var originalQuality = CalculateQuality(targetValues, estimatedClassValues);
     96      DataPartitionEnum data = DataPartitionEnum.Training,
     97      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
     98      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
     99
     100      var problemData = solution.ProblemData;
     101      var dataset = problemData.Dataset;
     102
     103      IEnumerable<int> rows;
     104      IEnumerable<double> targetValues;
     105      double originalAccuracy;
     106
     107      OnlineCalculatorError error;
     108
     109      switch (data) {
     110        case DataPartitionEnum.All:
     111          rows = problemData.AllIndices;
     112          targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.AllIndices).ToList();
     113          originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedClassValues, out error);
     114          if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation.");
     115          break;
     116        case DataPartitionEnum.Training:
     117          rows = problemData.TrainingIndices;
     118          targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToList();
     119          originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedTrainingClassValues, out error);
     120          if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation.");
     121          break;
     122        case DataPartitionEnum.Test:
     123          rows = problemData.TestIndices;
     124          targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToList();
     125          originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedTestClassValues, out error);
     126          if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation.");
     127          break;
     128        default: throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
     129      }
    133130
    134131      var impacts = new Dictionary<string, double>();
    135       var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
    136       var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();
    137 
    138       foreach (var inputVariable in inputvariables) {
    139         impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
    140       }
    141 
    142       return impacts.Select(i => Tuple.Create(i.Key, i.Value));
    143     }
    144 
    145     public static double CalculateImpact(string variableName,
    146       IClassificationModel model,
    147       IClassificationProblemData problemData,
    148       ModifiableDataset modifiableDataset,
    149       IEnumerable<int> rows,
    150       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
    151       FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
    152       IEnumerable<double> targetValues = null,
    153       double quality = double.NaN) {
    154 
    155       if (!model.VariablesUsedForPrediction.Contains(variableName)) { return 0.0; }
    156       if (!problemData.Dataset.VariableNames.Contains(variableName)) {
    157         throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
    158       }
    159 
    160       if (targetValues == null) {
    161         targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
    162       }
    163       if (quality == double.NaN) {
    164         quality = CalculateQuality(model.GetEstimatedClassValues(modifiableDataset, rows), targetValues);
    165       }
    166 
    167       IList originalValues = null;
    168       IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);
    169 
    170       double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
    171       double impact = quality - newValue;
    172 
    173       return impact;
    174     }
    175 
    176     private static IList GetReplacementValues(ModifiableDataset modifiableDataset,
    177       string variableName,
    178       IClassificationModel model,
    179       IEnumerable<int> rows,
    180       IEnumerable<double> targetValues,
    181       out IList originalValues,
    182       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
    183       FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
    184 
    185       IList replacementValues = null;
    186       if (modifiableDataset.VariableHasType<double>(variableName)) {
    187         originalValues = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
    188         replacementValues = GetReplacementValuesForDouble(modifiableDataset, rows, (List<double>)originalValues, replacementMethod);
    189       } else if (modifiableDataset.VariableHasType<string>(variableName)) {
    190         originalValues = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
    191         replacementValues = GetReplacementValuesForString(model, modifiableDataset, variableName, rows, (List<string>)originalValues, targetValues, factorReplacementMethod);
    192       } else {
    193         throw new NotSupportedException("Variable not supported");
    194       }
    195 
    196       return replacementValues;
    197     }
    198 
    199     private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset,
    200       IEnumerable<int> rows,
    201       List<double> originalValues,
    202       ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle) {
    203 
    204       IRandom random = new FastRandom(31415);
     132      var modifiableDataset = ((Dataset)dataset).ToModifiable();
     133
     134      var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction));
     135      var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList();
     136
     137      // calculate impacts for double variables
     138      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
     139        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
     140        var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
     141        if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     142
     143        impacts[inputVariable] = originalAccuracy - newAccuracy;
     144      }
     145
     146      // calculate impacts for string variables
     147      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) {
     148        if (factorReplacementMethod == FactorReplacementMethodEnum.Best) {
     149          // try replacing with all possible values and find the best replacement value
     150          var smallestImpact = double.PositiveInfinity;
     151          foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
     152            var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     153              Enumerable.Repeat(repl, dataset.Rows));
     154            var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
     155            if (error != OnlineCalculatorError.None)
     156              throw new InvalidOperationException("Error during accuracy calculation with replaced inputs.");
     157
     158            var impact = originalAccuracy - newAccuracy;
     159            if (impact < smallestImpact) smallestImpact = impact;
     160          }
     161          impacts[inputVariable] = smallestImpact;
     162        } else {
     163          // for replacement methods shuffle and mode
     164          // calculate impacts for factor variables
     165
     166          var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     167            factorReplacementMethod);
     168          var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
     169          if (error != OnlineCalculatorError.None)
     170            throw new InvalidOperationException("Error during accuracy calculation with replaced inputs.");
     171
     172          impacts[inputVariable] = originalAccuracy - newAccuracy;
     173        }
     174      } // foreach
     175      return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value));
     176    }
     177
     178    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
     179      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     180      double replacementValue;
    205181      List<double> replacementValues;
    206       double replacementValue;
    207 
    208       switch (replacementMethod) {
     182      IRandom rand;
     183
     184      switch (replacement) {
    209185        case ReplacementMethodEnum.Median:
    210186          replacementValue = rows.Select(r => originalValues[r]).Median();
    211           replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();
     187          replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
    212188          break;
    213189        case ReplacementMethodEnum.Average:
    214190          replacementValue = rows.Select(r => originalValues[r]).Average();
    215           replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();
     191          replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
    216192          break;
    217193        case ReplacementMethodEnum.Shuffle:
    218194          // new var has same empirical distribution but the relation to y is broken
     195          rand = new FastRandom(31415);
    219196          // prepare a complete column for the dataset
    220           replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
     197          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
    221198          // shuffle only the selected rows
    222           var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
     199          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
    223200          int i = 0;
    224201          // update column values
     
    230207          var avg = rows.Select(r => originalValues[r]).Average();
    231208          var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
     209          rand = new FastRandom(31415);
    232210          // prepare a complete column for the dataset
    233           replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
     211          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
    234212          // update column values
    235213          foreach (var r in rows) {
    236             replacementValues[r] = NormalDistributedRandom.NextDouble(random, avg, stdDev);
     214            replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
    237215          }
    238216          break;
    239217
    240218        default:
    241           throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod));
    242       }
    243 
    244       return replacementValues;
    245     }
    246 
    247     private static IList GetReplacementValuesForString(IClassificationModel model,
    248       ModifiableDataset modifiableDataset,
    249       string variableName,
     219          throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
     220      }
     221
     222      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     223    }
     224
     225    private static IEnumerable<double> EvaluateModelWithReplacedVariable(
     226      IClassificationModel model, string variable, ModifiableDataset dataset,
    250227      IEnumerable<int> rows,
    251       List<string> originalValues,
    252       IEnumerable<double> targetValues,
    253       FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) {
    254 
    255       List<string> replacementValues = null;
    256       IRandom random = new FastRandom(31415);
    257 
    258       switch (factorReplacementMethod) {
    259         case FactorReplacementMethodEnum.Best:
    260           // try replacing with all possible values and find the best replacement value
    261           var bestQuality = double.NegativeInfinity;
    262           foreach (var repl in modifiableDataset.GetStringValues(variableName, rows).Distinct()) {
    263             List<string> curReplacementValues = Enumerable.Repeat(repl, modifiableDataset.Rows).ToList();
    264             //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency
    265             var newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, curReplacementValues, targetValues);
    266             var curQuality = newValue;
    267 
    268             if (curQuality > bestQuality) {
    269               bestQuality = curQuality;
    270               replacementValues = curReplacementValues;
    271             }
    272           }
    273           break;
     228      FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
     229      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     230      List<string> replacementValues;
     231      IRandom rand;
     232
     233      switch (replacement) {
    274234        case FactorReplacementMethodEnum.Mode:
    275235          var mostCommonValue = rows.Select(r => originalValues[r])
     
    277237            .OrderByDescending(g => g.Count())
    278238            .First().Key;
    279           replacementValues = Enumerable.Repeat(mostCommonValue, modifiableDataset.Rows).ToList();
     239          replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList();
    280240          break;
    281241        case FactorReplacementMethodEnum.Shuffle:
    282242          // new var has same empirical distribution but the relation to y is broken
     243          rand = new FastRandom(31415);
    283244          // prepare a complete column for the dataset
    284           replacementValues = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
     245          replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
    285246          // shuffle only the selected rows
    286           var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
     247          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
    287248          int i = 0;
    288249          // update column values
     
    292253          break;
    293254        default:
    294           throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
    295       }
    296 
    297       return replacementValues;
    298     }
    299 
    300     private static double CalculateQualityForReplacement(
    301       IClassificationModel model,
    302       ModifiableDataset modifiableDataset,
    303       string variableName,
    304       IList originalValues,
    305       IEnumerable<int> rows,
    306       IList replacementValues,
    307       IEnumerable<double> targetValues) {
    308 
    309       modifiableDataset.ReplaceVariable(variableName, replacementValues);
    310       var discModel = model as IDiscriminantFunctionClassificationModel;
    311       if (discModel != null) {
    312         var problemData = new ClassificationProblemData(modifiableDataset, modifiableDataset.VariableNames, model.TargetVariable);
    313         discModel.RecalculateModelParameters(problemData, rows);
    314       }
    315 
     255          throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
     256      }
     257
     258      return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
     259    }
     260
     261    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable,
     262      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) {
     263      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
     264      dataset.ReplaceVariable(variable, replacementValues.ToList());
    316265      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    317       var estimates = model.GetEstimatedClassValues(modifiableDataset, rows).ToList();
    318       var ret = CalculateQuality(targetValues, estimates);
    319       modifiableDataset.ReplaceVariable(variableName, originalValues);
    320 
    321       return ret;
    322     }
    323 
    324     public static double CalculateQuality(IEnumerable<double> targetValues, IEnumerable<double> estimatedClassValues) {
    325       OnlineCalculatorError errorState;
    326       var ret = OnlineAccuracyCalculator.Calculate(targetValues, estimatedClassValues, out errorState);
    327       if (errorState != OnlineCalculatorError.None) { throw new InvalidOperationException("Error during calculation with replaced inputs."); }
    328       return ret;
    329     }
    330 
    331     public static IEnumerable<int> GetPartitionRows(DataPartitionEnum dataPartition, IClassificationProblemData problemData) {
    332       IEnumerable<int> rows;
    333 
    334       switch (dataPartition) {
    335         case DataPartitionEnum.All:
    336           rows = problemData.AllIndices;
    337           break;
    338         case DataPartitionEnum.Test:
    339           rows = problemData.TestIndices;
    340           break;
    341         case DataPartitionEnum.Training:
    342           rows = problemData.TrainingIndices;
    343           break;
    344         default:
    345           throw new NotSupportedException("DataPartition not supported");
    346       }
    347 
    348       return rows;
     266      var estimates = model.GetEstimatedClassValues(dataset, rows).ToList();
     267      dataset.ReplaceVariable(variable, originalValues);
     268
     269      return estimates;
     270    }
     271    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable,
     272      ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
     273      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
     274      dataset.ReplaceVariable(variable, replacementValues.ToList());
     275      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
     276      var estimates = model.GetEstimatedClassValues(dataset, rows).ToList();
     277      dataset.ReplaceVariable(variable, originalValues);
     278
     279      return estimates;
    349280    }
    350281  }
  • branches/2904_CalculateImpacts/3.4/Implementation/DataAnalysisProblemData.cs

    r15583 r16188  
    163163
    164164      var variables = dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable));
    165       var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x)));
     165      var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x).AsReadOnly()));
    166166      foreach (StringValue x in inputVariables)
    167167        inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value));
  • branches/2904_CalculateImpacts/3.4/ModifiableDataset.cs

    r15769 r16188  
    3939
    4040    private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner) {
    41       var variables = variableValues.Keys.ToList();
    42       foreach (var v in variables) {
    43         var type = GetVariableType(v);
    44         if (type == typeof(DateTime)) {
    45           variableValues[v] = GetDateTimeValues(v).ToList();
    46         } else if (type == typeof(double)) {
    47           variableValues[v] = GetDoubleValues(v).ToList();
    48         } else if (type == typeof(string)) {
    49           variableValues[v] = GetStringValues(v).ToList();
    50         } else {
    51           throw new ArgumentException("Unsupported type " + type + " for variable " + v);
     41      variableNames = new List<string>(original.variableNames);
     42      variableValues = CloneValues(original.variableValues);
     43    }
     44
     45    public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); }
     46
     47    public ModifiableDataset() { }
     48
     49    public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) :
     50      base(variableNames, variableValues, cloneValues) { }
     51
     52    public Dataset ToDataset() {
     53      return new Dataset(variableNames, variableNames.Select(v => variableValues[v]));
     54    }
     55
     56
     57    public IEnumerable<object> GetRow(int row) {
     58      return variableValues.Select(x => x.Value[row]);
     59    }
     60
     61    public void AddRow(IEnumerable<object> values) {
     62      var list = values.ToList();
     63      if (list.Count != variableNames.Count)
     64        throw new ArgumentException("The number of values must be equal to the number of variable names.");
     65      // check if all the values are of the correct type
     66      for (int i = 0; i < list.Count; ++i) {
     67        if (list[i].GetType() != GetVariableType(variableNames[i])) {
     68          throw new ArgumentException("The type of the provided value does not match the variable type.");
    5269        }
    5370      }
    54     }
    55     public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); }
    56     public ModifiableDataset() : base() { }
    57 
    58     public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) : base(variableNames, variableValues) { }
     71      // add values
     72      for (int i = 0; i < list.Count; ++i) {
     73        variableValues[variableNames[i]].Add(list[i]);
     74      }
     75      Rows++;
     76      OnRowsChanged();
     77      OnReset();
     78    }
    5979
    6080    public void ReplaceRow(int row, IEnumerable<object> values) {
     
    7292        variableValues[variableNames[i]][row] = list[i];
    7393      }
     94      OnReset();
     95    }
     96
     97    // slow, avoid using this
     98    public void RemoveRow(int row) {
     99      foreach (var list in variableValues.Values)
     100        list.RemoveAt(row);
     101      Rows--;
     102      OnRowsChanged();
     103      OnReset();
     104    }
     105
     106    // adds a new variable to the dataset
     107    public void AddVariable(string variableName, IList values) {
     108      InsertVariable(variableName, Columns, values);
     109    }
     110
     111    public void InsertVariable(string variableName, int position, IList values) {
     112      if (variableValues.ContainsKey(variableName))
     113        throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName));
     114
     115      if (position < 0 || position > Columns)
     116        throw new ArgumentException(string.Format("Incorrect position {0} specified. The position must be between 0 and {1}.", position, Columns));
     117
     118      if (values == null)
     119        throw new ArgumentNullException("values", "Values must not be null. At least an empty list of values has to be provided.");
     120
     121      if (values.Count != Rows)
     122        throw new ArgumentException(string.Format("{0} values are provided, but {1} rows are present in the dataset.", values.Count, Rows));
     123
     124      if (!IsAllowedType(values))
     125        throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     126
     127      variableNames.Insert(position, variableName);
     128      variableValues[variableName] = values;
     129
     130      OnColumnsChanged();
     131      OnColumnNamesChanged();
    74132      OnReset();
    75133    }
     
    85143    }
    86144
    87     public void AddRow(IEnumerable<object> values) {
    88       var list = values.ToList();
    89       if (list.Count != variableNames.Count)
    90         throw new ArgumentException("The number of values must be equal to the number of variable names.");
    91       // check if all the values are of the correct type
    92       for (int i = 0; i < list.Count; ++i) {
    93         if (list[i].GetType() != GetVariableType(variableNames[i])) {
    94           throw new ArgumentException("The type of the provided value does not match the variable type.");
    95         }
    96       }
    97       // add values
    98       for (int i = 0; i < list.Count; ++i) {
    99         variableValues[variableNames[i]].Add(list[i]);
    100       }
    101       rows++;
    102       OnRowsChanged();
    103       OnReset();
    104     }
    105 
    106     // adds a new variable to the dataset
    107     public void AddVariable(string variableName, IList values) {
    108       if (variableValues.ContainsKey(variableName))
    109         throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName));
    110 
    111       if (values == null || values.Count == 0)
    112         throw new ArgumentException("Cannot add variable with no values.");
    113 
    114       if (!IsAllowedType(values))
    115         throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
    116 
    117       variableValues[variableName] = values;
    118       variableNames.Add(variableName);
    119 
    120       OnColumnsChanged();
    121       OnColumnNamesChanged();
    122       OnReset();
    123     }
    124145
    125146    public void RemoveVariable(string variableName) {
     
    133154    }
    134155
    135     // slow, avoid using this
    136     public void RemoveRow(int row) {
    137       foreach (var list in variableValues.Values)
    138         list.RemoveAt(row);
    139       rows--;
     156    public void ClearValues() {
     157      foreach (var list in variableValues.Values) {
     158        list.Clear();
     159      }
     160      Rows = 0;
    140161      OnRowsChanged();
    141162      OnReset();
    142163    }
     164
    143165
    144166    public void SetVariableValue(object value, string variableName, int row) {
  • branches/2904_CalculateImpacts/HeuristicLab.Problems.DataAnalysis.Views/3.4

    • Property svn:mergeinfo set to (toggle deleted branches)
      /stable/HeuristicLab.Problems.DataAnalysis.Views/3.4 merged eligible
      /trunk/HeuristicLab.Problems.DataAnalysis.Views/3.4 merged eligible
      /branches/Async/HeuristicLab.Problems.DataAnalysis.Views/3.4 13329-15286
      /branches/Benchmarking/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 6917-7005
      /branches/ClassificationModelComparison/HeuristicLab.Problems.DataAnalysis.Views/3.4 9116-13099
      /branches/CloningRefactoring/HeuristicLab.Problems.DataAnalysis.Views/3.4 4656-4721
      /branches/DataAnalysis Refactoring/HeuristicLab.Problems.DataAnalysis.Views/3.4 5471-5808
      /branches/DataAnalysis SolutionEnsembles/HeuristicLab.Problems.DataAnalysis.Views/3.4 5815-6180
      /branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Views/3.4 4458-4459,​4462,​4464
      /branches/DataPreprocessing/HeuristicLab.Problems.DataAnalysis.Views/3.4 10085-11101
      /branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis.Views/3.4 8036-8538
      /branches/GP.Grammar.Editor/HeuristicLab.Problems.DataAnalysis.Views/3.4 6284-6795
      /branches/GP.Symbols (TimeLag, Diff, Integral)/HeuristicLab.Problems.DataAnalysis.Views/3.4 5060
      /branches/HeuristicLab.DatasetRefactor/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 11570-12508
      /branches/HeuristicLab.Problems.Orienteering/HeuristicLab.Problems.DataAnalysis.Views/3.4 11130-12721
      /branches/HeuristicLab.RegressionSolutionGradientView/HeuristicLab.Problems.DataAnalysis.Views/3.4 13780-14091
      /branches/HeuristicLab.TimeSeries/HeuristicLab.Problems.DataAnalysis.Views/3.4 7098-8789
      /branches/NET40/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 5138-5162
      /branches/ParallelEngine/HeuristicLab.Problems.DataAnalysis.Views/3.4 5175-5192
      /branches/ProblemInstancesRegressionAndClassification/HeuristicLab.Problems.DataAnalysis.Views/3.4 7568-7810
      /branches/QAPAlgorithms/HeuristicLab.Problems.DataAnalysis.Views/3.4 6350-6627
      /branches/Restructure trunk solution/HeuristicLab.Problems.DataAnalysis.Views/3.4 6828
      /branches/SimplifierViewsProgress/HeuristicLab.Problems.DataAnalysis.Views/3.4 15318-15370
      /branches/SuccessProgressAnalysis/HeuristicLab.Problems.DataAnalysis.Views/3.4 5370-5682
      /branches/Trunk/HeuristicLab.Problems.DataAnalysis.Views/3.4 6829-6865
      /branches/VNS/HeuristicLab.Problems.DataAnalysis.Views/3.4 5594-5752
      /branches/histogram/HeuristicLab.Problems.DataAnalysis.Views/3.4 5959-6341
      /branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Views/3.4 14232-14825
  • branches/2904_CalculateImpacts/HeuristicLab.Tests

Note: See TracChangeset for help on using the changeset viewer.