Changeset 16188
- Timestamp:
- 09/27/18 09:50:33 (6 years ago)
- Location:
- branches/2904_CalculateImpacts
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2904_CalculateImpacts/3.4
- Property svn:mergeinfo changed
/trunk/HeuristicLab.Problems.DataAnalysis/3.4 (added) merged: 15829,15871,16059,16063,16084,16118,16120
- Property svn:mergeinfo changed
-
branches/2904_CalculateImpacts/3.4/Dataset.cs
r15769 r16188 38 38 protected Dataset(Dataset original, Cloner cloner) 39 39 : base(original, cloner) { 40 // no need to clone the variable values because these can't be modified 40 41 variableValues = new Dictionary<string, IList>(original.variableValues); 41 42 variableNames = new List<string>(original.variableNames); 42 43 rows = original.rows; 43 44 } 45 44 46 public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); } 45 47 … … 58 60 /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param> 59 61 public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) 60 : base() { 62 : this(variableNames, variableValues, cloneValues: true) { 63 } 64 65 protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) { 61 66 Name = "-"; 62 if (!variableNames.Any()) { 67 68 if (variableNames.Any()) { 69 this.variableNames = new List<string>(variableNames); 70 } else { 63 71 this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList(); 64 } else if (variableNames.Count() != variableValues.Count()) { 65 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 66 } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) { 67 throw new ArgumentException("The number of values must be equal for every variable"); 68 } else if (variableNames.Distinct().Count() != variableNames.Count()) { 69 var duplicateVariableNames = 70 variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 71 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 72 foreach (var duplicateVariableName in duplicateVariableNames) 73 message += duplicateVariableName + Environment.NewLine; 74 throw new ArgumentException(message); 75 } 72 } 73 // check if the 
arguments are consistent (no duplicate variables, same number of rows, correct data types, ...) 74 CheckArguments(this.variableNames, variableValues); 75 76 76 rows = variableValues.First().Count; 77 this.variableNames = new List<string>(variableNames); 78 this.variableValues = new Dictionary<string, IList>(this.variableNames.Count); 79 for (int i = 0; i < this.variableNames.Count; i++) { 80 var variableName = this.variableNames[i]; 81 var values = variableValues.ElementAt(i); 82 83 if (!IsAllowedType(values)) { 84 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName)); 77 78 if (cloneValues) { 79 this.variableValues = CloneValues(this.variableNames, variableValues); 80 } else { 81 this.variableValues = new Dictionary<string, IList>(this.variableNames.Count); 82 for (int i = 0; i < this.variableNames.Count; i++) { 83 var variableName = this.variableNames[i]; 84 var values = variableValues.ElementAt(i); 85 this.variableValues.Add(variableName, values); 85 86 } 86 87 this.variableValues.Add(variableName, values);88 87 } 89 88 } … … 117 116 118 117 public ModifiableDataset ToModifiable() { 119 var values = new List<IList>(); 120 foreach (var v in variableNames) { 121 if (VariableHasType<double>(v)) { 122 values.Add(new List<double>((IList<double>)variableValues[v])); 123 } else if (VariableHasType<string>(v)) { 124 values.Add(new List<string>((IList<string>)variableValues[v])); 125 } else if (VariableHasType<DateTime>(v)) { 126 values.Add(new List<DateTime>((IList<DateTime>)variableValues[v])); 127 } else { 128 throw new ArgumentException("Unknown variable type."); 129 } 130 } 131 return new ModifiableDataset(variableNames, values); 118 return new ModifiableDataset(variableNames, variableNames.Select(v => variableValues[v]), true); 132 119 } 133 120 … … 142 129 } 143 130 144 protected Dataset(Dataset dataset) : this(dataset.variableNames, dataset.variableValues.Values) { } 131 145 132 146 133 #region 
Backwards compatible code, remove with 3.5 … … 238 225 return new ReadOnlyCollection<DateTime>(values); 239 226 } 240 241 242 227 private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) { 243 228 var values = GetValues<T>(variableName); … … 255 240 return variableValues[variableName] is IList<T>; 256 241 } 257 258 242 protected Type GetVariableType(string variableName) { 259 243 IList list; … … 263 247 return GetElementType(list); 264 248 } 265 266 protected Type GetElementType(IList list) { 249 protected static Type GetElementType(IList list) { 267 250 var type = list.GetType(); 268 251 return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType(); 269 252 } 270 271 protected bool IsAllowedType(IList list) { 253 protected static bool IsAllowedType(IList list) { 272 254 var type = GetElementType(list); 273 255 return IsAllowedType(type); 274 256 } 275 276 protected bool IsAllowedType(Type type) { 257 protected static bool IsAllowedType(Type type) { 277 258 return type == typeof(double) || type == typeof(string) || type == typeof(DateTime); 259 } 260 261 protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) { 262 if (variableNames.Count() != variableValues.Count()) { 263 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 264 } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) { 265 throw new ArgumentException("The number of values must be equal for every variable"); 266 } else if (variableNames.Distinct().Count() != variableNames.Count()) { 267 var duplicateVariableNames = 268 variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 269 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 270 foreach (var duplicateVariableName in duplicateVariableNames) 271 message += duplicateVariableName + 
Environment.NewLine; 272 throw new ArgumentException(message); 273 } 274 // check if all the variables are supported 275 foreach (var t in variableNames.Zip(variableValues, Tuple.Create)) { 276 var variableName = t.Item1; 277 var values = t.Item2; 278 279 if (!IsAllowedType(values)) { 280 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName)); 281 } 282 } 283 } 284 285 protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) { 286 return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value)); 287 } 288 289 protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) { 290 return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2)); 291 } 292 293 protected static IList CloneValues(IList values) { 294 var doubleValues = values as IList<double>; 295 if (doubleValues != null) return new List<double>(doubleValues); 296 297 var stringValues = values as IList<string>; 298 if (stringValues != null) return new List<string>(stringValues); 299 300 var dateTimeValues = values as IList<DateTime>; 301 if (dateTimeValues != null) return new List<DateTime>(dateTimeValues); 302 303 throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values))); 278 304 } 279 305 280 306 #region IStringConvertibleMatrix Members 281 307 [Storable] 282 pr otectedint rows;308 private int rows; 283 309 public int Rows { 284 310 get { return rows; } 311 protected set { rows = value; } 285 312 } 286 313 int IStringConvertibleMatrix.Rows { -
branches/2904_CalculateImpacts/3.4/Implementation/Classification/ClassificationSolutionVariableImpactsCalculator.cs
r16181 r16188 23 23 24 24 using System; 25 using System.Collections;26 25 using System.Collections.Generic; 27 26 using System.Linq; … … 37 36 [Item("ClassificationSolution Impacts Calculator", "Calculation of the impacts of input variables for any classification solution")] 38 37 public sealed class ClassificationSolutionVariableImpactsCalculator : ParameterizedNamedItem { 39 #region Parameters/Properties40 38 public enum ReplacementMethodEnum { 41 39 Median, … … 56 54 57 55 private const string ReplacementParameterName = "Replacement Method"; 58 private const string FactorReplacementParameterName = "Factor Replacement Method";59 56 private const string DataPartitionParameterName = "DataPartition"; 60 57 61 58 public IFixedValueParameter<EnumValue<ReplacementMethodEnum>> ReplacementParameter { 62 59 get { return (IFixedValueParameter<EnumValue<ReplacementMethodEnum>>)Parameters[ReplacementParameterName]; } 63 }64 public IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>> FactorReplacementParameter {65 get { return (IFixedValueParameter<EnumValue<FactorReplacementMethodEnum>>)Parameters[FactorReplacementParameterName]; }66 60 } 67 61 public IFixedValueParameter<EnumValue<DataPartitionEnum>> DataPartitionParameter { … … 73 67 set { ReplacementParameter.Value.Value = value; } 74 68 } 75 public FactorReplacementMethodEnum FactorReplacementMethod {76 get { return FactorReplacementParameter.Value.Value; }77 set { FactorReplacementParameter.Value.Value = value; }78 }79 69 public DataPartitionEnum DataPartition { 80 70 get { return DataPartitionParameter.Value.Value; } 81 71 set { DataPartitionParameter.Value.Value = value; } 82 72 } 83 #endregion 84 85 #region Ctor/Cloner 73 74 86 75 [StorableConstructor] 87 76 private ClassificationSolutionVariableImpactsCalculator(bool deserializing) : base(deserializing) { } 88 77 private ClassificationSolutionVariableImpactsCalculator(ClassificationSolutionVariableImpactsCalculator original, Cloner cloner) 89 78 : 
base(original, cloner) { } 79 public override IDeepCloneable Clone(Cloner cloner) { 80 return new ClassificationSolutionVariableImpactsCalculator(this, cloner); 81 } 82 90 83 public ClassificationSolutionVariableImpactsCalculator() 91 84 : base() { 92 Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Shuffle))); 93 Parameters.Add(new FixedValueParameter<EnumValue<FactorReplacementMethodEnum>>(FactorReplacementParameterName, "The replacement method for factor variables during impact calculation.", new EnumValue<FactorReplacementMethodEnum>(FactorReplacementMethodEnum.Best))); 85 Parameters.Add(new FixedValueParameter<EnumValue<ReplacementMethodEnum>>(ReplacementParameterName, "The replacement method for variables during impact calculation.", new EnumValue<ReplacementMethodEnum>(ReplacementMethodEnum.Median))); 94 86 Parameters.Add(new FixedValueParameter<EnumValue<DataPartitionEnum>>(DataPartitionParameterName, "The data partition on which the impacts are calculated.", new EnumValue<DataPartitionEnum>(DataPartitionEnum.Training))); 95 87 } 96 97 public override IDeepCloneable Clone(Cloner cloner) {98 return new ClassificationSolutionVariableImpactsCalculator(this, cloner);99 }100 #endregion101 88 102 89 //mkommend: annoying name clash with static method, open to better naming suggestions 103 90 public IEnumerable<Tuple<string, double>> Calculate(IClassificationSolution solution) { 104 return CalculateImpacts(solution, ReplacementMethod, FactorReplacementMethod, DataPartition);91 return CalculateImpacts(solution, DataPartition, ReplacementMethod); 105 92 } 106 93 107 94 public static IEnumerable<Tuple<string, double>> CalculateImpacts( 108 95 IClassificationSolution solution, 109 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, 110 FactorReplacementMethodEnum 
factorReplacementMethod = FactorReplacementMethodEnum.Best, 111 DataPartitionEnum dataPartition = DataPartitionEnum.Training) { 112 113 IEnumerable<int> rows = GetPartitionRows(dataPartition, solution.ProblemData); 114 IEnumerable<double> estimatedClassValues = solution.GetEstimatedClassValues(rows); 115 return CalculateImpacts(solution.Model, solution.ProblemData, estimatedClassValues, rows, replacementMethod, factorReplacementMethod); 116 } 117 118 public static IEnumerable<Tuple<string, double>> CalculateImpacts( 119 IClassificationModel model, 120 IClassificationProblemData problemData, 121 IEnumerable<double> estimatedClassValues, 122 IEnumerable<int> rows, 123 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, 124 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) { 125 126 //fholzing: try and catch in case a different dataset is loaded, otherwise statement is neglectable 127 var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames); 128 if (missingVariables.Any()) { 129 throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables))); 130 } 131 IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows); 132 var originalQuality = CalculateQuality(targetValues, estimatedClassValues); 96 DataPartitionEnum data = DataPartitionEnum.Training, 97 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median, 98 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) { 99 100 var problemData = solution.ProblemData; 101 var dataset = problemData.Dataset; 102 103 IEnumerable<int> rows; 104 IEnumerable<double> targetValues; 105 double originalAccuracy; 106 107 OnlineCalculatorError error; 108 109 switch (data) { 110 case DataPartitionEnum.All: 111 rows = 
problemData.AllIndices; 112 targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.AllIndices).ToList(); 113 originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedClassValues, out error); 114 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation."); 115 break; 116 case DataPartitionEnum.Training: 117 rows = problemData.TrainingIndices; 118 targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToList(); 119 originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedTrainingClassValues, out error); 120 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation."); 121 break; 122 case DataPartitionEnum.Test: 123 rows = problemData.TestIndices; 124 targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToList(); 125 originalAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, solution.EstimatedTestClassValues, out error); 126 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during accuracy calculation."); 127 break; 128 default: throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data)); 129 } 133 130 134 131 var impacts = new Dictionary<string, double>(); 135 var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction)); 136 var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable(); 137 138 foreach (var inputVariable in inputvariables) { 139 impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality); 140 } 141 142 return impacts.Select(i => Tuple.Create(i.Key, i.Value)); 143 } 144 145 public static double 
CalculateImpact(string variableName, 146 IClassificationModel model, 147 IClassificationProblemData problemData, 148 ModifiableDataset modifiableDataset, 149 IEnumerable<int> rows, 150 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, 151 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best, 152 IEnumerable<double> targetValues = null, 153 double quality = double.NaN) { 154 155 if (!model.VariablesUsedForPrediction.Contains(variableName)) { return 0.0; } 156 if (!problemData.Dataset.VariableNames.Contains(variableName)) { 157 throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName)); 158 } 159 160 if (targetValues == null) { 161 targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows); 162 } 163 if (quality == double.NaN) { 164 quality = CalculateQuality(model.GetEstimatedClassValues(modifiableDataset, rows), targetValues); 165 } 166 167 IList originalValues = null; 168 IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod); 169 170 double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues); 171 double impact = quality - newValue; 172 173 return impact; 174 } 175 176 private static IList GetReplacementValues(ModifiableDataset modifiableDataset, 177 string variableName, 178 IClassificationModel model, 179 IEnumerable<int> rows, 180 IEnumerable<double> targetValues, 181 out IList originalValues, 182 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, 183 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) { 184 185 IList replacementValues = null; 186 if (modifiableDataset.VariableHasType<double>(variableName)) { 187 
originalValues = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList(); 188 replacementValues = GetReplacementValuesForDouble(modifiableDataset, rows, (List<double>)originalValues, replacementMethod); 189 } else if (modifiableDataset.VariableHasType<string>(variableName)) { 190 originalValues = modifiableDataset.GetReadOnlyStringValues(variableName).ToList(); 191 replacementValues = GetReplacementValuesForString(model, modifiableDataset, variableName, rows, (List<string>)originalValues, targetValues, factorReplacementMethod); 192 } else { 193 throw new NotSupportedException("Variable not supported"); 194 } 195 196 return replacementValues; 197 } 198 199 private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset, 200 IEnumerable<int> rows, 201 List<double> originalValues, 202 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle) { 203 204 IRandom random = new FastRandom(31415); 132 var modifiableDataset = ((Dataset)dataset).ToModifiable(); 133 134 var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction)); 135 var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList(); 136 137 // calculate impacts for double variables 138 foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) { 139 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod); 140 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); 141 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs."); 142 143 impacts[inputVariable] = originalAccuracy - newAccuracy; 144 } 145 146 // calculate impacts for string variables 147 foreach (var inputVariable in 
allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) { 148 if (factorReplacementMethod == FactorReplacementMethodEnum.Best) { 149 // try replacing with all possible values and find the best replacement value 150 var smallestImpact = double.PositiveInfinity; 151 foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) { 152 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 153 Enumerable.Repeat(repl, dataset.Rows)); 154 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); 155 if (error != OnlineCalculatorError.None) 156 throw new InvalidOperationException("Error during accuracy calculation with replaced inputs."); 157 158 var impact = originalAccuracy - newAccuracy; 159 if (impact < smallestImpact) smallestImpact = impact; 160 } 161 impacts[inputVariable] = smallestImpact; 162 } else { 163 // for replacement methods shuffle and mode 164 // calculate impacts for factor variables 165 166 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 167 factorReplacementMethod); 168 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); 169 if (error != OnlineCalculatorError.None) 170 throw new InvalidOperationException("Error during accuracy calculation with replaced inputs."); 171 172 impacts[inputVariable] = originalAccuracy - newAccuracy; 173 } 174 } // foreach 175 return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value)); 176 } 177 178 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) { 179 var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList(); 180 double replacementValue; 205 181 List<double> 
replacementValues; 206 double replacementValue;207 208 switch (replacement Method) {182 IRandom rand; 183 184 switch (replacement) { 209 185 case ReplacementMethodEnum.Median: 210 186 replacementValue = rows.Select(r => originalValues[r]).Median(); 211 replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();187 replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList(); 212 188 break; 213 189 case ReplacementMethodEnum.Average: 214 190 replacementValue = rows.Select(r => originalValues[r]).Average(); 215 replacementValues = Enumerable.Repeat(replacementValue, modifiableDataset.Rows).ToList();191 replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList(); 216 192 break; 217 193 case ReplacementMethodEnum.Shuffle: 218 194 // new var has same empirical distribution but the relation to y is broken 195 rand = new FastRandom(31415); 219 196 // prepare a complete column for the dataset 220 replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();197 replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList(); 221 198 // shuffle only the selected rows 222 var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand om).ToList();199 var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList(); 223 200 int i = 0; 224 201 // update column values … … 230 207 var avg = rows.Select(r => originalValues[r]).Average(); 231 208 var stdDev = rows.Select(r => originalValues[r]).StandardDeviation(); 209 rand = new FastRandom(31415); 232 210 // prepare a complete column for the dataset 233 replacementValues = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();211 replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList(); 234 212 // update column values 235 213 foreach (var r in rows) { 236 replacementValues[r] = NormalDistributedRandom.NextDouble(rand om, avg, stdDev);214 replacementValues[r] = 
NormalDistributedRandom.NextDouble(rand, avg, stdDev); 237 215 } 238 216 break; 239 217 240 218 default: 241 throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod)); 242 } 243 244 return replacementValues; 245 } 246 247 private static IList GetReplacementValuesForString(IClassificationModel model, 248 ModifiableDataset modifiableDataset, 249 string variableName, 219 throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement)); 220 } 221 222 return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues); 223 } 224 225 private static IEnumerable<double> EvaluateModelWithReplacedVariable( 226 IClassificationModel model, string variable, ModifiableDataset dataset, 250 227 IEnumerable<int> rows, 251 List<string> originalValues, 252 IEnumerable<double> targetValues, 253 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) { 254 255 List<string> replacementValues = null; 256 IRandom random = new FastRandom(31415); 257 258 switch (factorReplacementMethod) { 259 case FactorReplacementMethodEnum.Best: 260 // try replacing with all possible values and find the best replacement value 261 var bestQuality = double.NegativeInfinity; 262 foreach (var repl in modifiableDataset.GetStringValues(variableName, rows).Distinct()) { 263 List<string> curReplacementValues = Enumerable.Repeat(repl, modifiableDataset.Rows).ToList(); 264 //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency 265 var newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, curReplacementValues, targetValues); 266 var curQuality = newValue; 267 268 if (curQuality > bestQuality) { 269 bestQuality = curQuality; 270 replacementValues = curReplacementValues; 271 } 272 } 273 break; 228 FactorReplacementMethodEnum replacement = 
FactorReplacementMethodEnum.Shuffle) { 229 var originalValues = dataset.GetReadOnlyStringValues(variable).ToList(); 230 List<string> replacementValues; 231 IRandom rand; 232 233 switch (replacement) { 274 234 case FactorReplacementMethodEnum.Mode: 275 235 var mostCommonValue = rows.Select(r => originalValues[r]) … … 277 237 .OrderByDescending(g => g.Count()) 278 238 .First().Key; 279 replacementValues = Enumerable.Repeat(mostCommonValue, modifiableDataset.Rows).ToList();239 replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList(); 280 240 break; 281 241 case FactorReplacementMethodEnum.Shuffle: 282 242 // new var has same empirical distribution but the relation to y is broken 243 rand = new FastRandom(31415); 283 244 // prepare a complete column for the dataset 284 replacementValues = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();245 replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList(); 285 246 // shuffle only the selected rows 286 var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand om).ToList();247 var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList(); 287 248 int i = 0; 288 249 // update column values … … 292 253 break; 293 254 default: 294 throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod)); 295 } 296 297 return replacementValues; 298 } 299 300 private static double CalculateQualityForReplacement( 301 IClassificationModel model, 302 ModifiableDataset modifiableDataset, 303 string variableName, 304 IList originalValues, 305 IEnumerable<int> rows, 306 IList replacementValues, 307 IEnumerable<double> targetValues) { 308 309 modifiableDataset.ReplaceVariable(variableName, replacementValues); 310 var discModel = model as IDiscriminantFunctionClassificationModel; 311 if (discModel != null) { 312 var problemData = new ClassificationProblemData(modifiableDataset, modifiableDataset.VariableNames, 
model.TargetVariable); 313 discModel.RecalculateModelParameters(problemData, rows); 314 } 315 255 throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement)); 256 } 257 258 return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues); 259 } 260 261 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable, 262 ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) { 263 var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList(); 264 dataset.ReplaceVariable(variable, replacementValues.ToList()); 316 265 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 317 var estimates = model.GetEstimatedClassValues(modifiableDataset, rows).ToList(); 318 var ret = CalculateQuality(targetValues, estimates); 319 modifiableDataset.ReplaceVariable(variableName, originalValues); 320 321 return ret; 322 } 323 324 public static double CalculateQuality(IEnumerable<double> targetValues, IEnumerable<double> estimatedClassValues) { 325 OnlineCalculatorError errorState; 326 var ret = OnlineAccuracyCalculator.Calculate(targetValues, estimatedClassValues, out errorState); 327 if (errorState != OnlineCalculatorError.None) { throw new InvalidOperationException("Error during calculation with replaced inputs."); } 328 return ret; 329 } 330 331 public static IEnumerable<int> GetPartitionRows(DataPartitionEnum dataPartition, IClassificationProblemData problemData) { 332 IEnumerable<int> rows; 333 334 switch (dataPartition) { 335 case DataPartitionEnum.All: 336 rows = problemData.AllIndices; 337 break; 338 case DataPartitionEnum.Test: 339 rows = problemData.TestIndices; 340 break; 341 case DataPartitionEnum.Training: 342 rows = problemData.TrainingIndices; 343 break; 344 default: 345 throw new NotSupportedException("DataPartition not supported"); 
346 } 347 348 return rows; 266 var estimates = model.GetEstimatedClassValues(dataset, rows).ToList(); 267 dataset.ReplaceVariable(variable, originalValues); 268 269 return estimates; 270 } 271 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IClassificationModel model, string variable, 272 ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) { 273 var originalValues = dataset.GetReadOnlyStringValues(variable).ToList(); 274 dataset.ReplaceVariable(variable, replacementValues.ToList()); 275 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 276 var estimates = model.GetEstimatedClassValues(dataset, rows).ToList(); 277 dataset.ReplaceVariable(variable, originalValues); 278 279 return estimates; 349 280 } 350 281 } -
branches/2904_CalculateImpacts/3.4/Implementation/DataAnalysisProblemData.cs
r15583 r16188 163 163 164 164 var variables = dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable)); 165 var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x) ));165 var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x).AsReadOnly())); 166 166 foreach (StringValue x in inputVariables) 167 167 inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value)); -
branches/2904_CalculateImpacts/3.4/ModifiableDataset.cs
r15769 r16188 39 39 40 40 private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner) { 41 var variables = variableValues.Keys.ToList(); 42 foreach (var v in variables) { 43 var type = GetVariableType(v); 44 if (type == typeof(DateTime)) { 45 variableValues[v] = GetDateTimeValues(v).ToList(); 46 } else if (type == typeof(double)) { 47 variableValues[v] = GetDoubleValues(v).ToList(); 48 } else if (type == typeof(string)) { 49 variableValues[v] = GetStringValues(v).ToList(); 50 } else { 51 throw new ArgumentException("Unsupported type " + type + " for variable " + v); 41 variableNames = new List<string>(original.variableNames); 42 variableValues = CloneValues(original.variableValues); 43 } 44 45 public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); } 46 47 public ModifiableDataset() { } 48 49 public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) : 50 base(variableNames, variableValues, cloneValues) { } 51 52 public Dataset ToDataset() { 53 return new Dataset(variableNames, variableNames.Select(v => variableValues[v])); 54 } 55 56 57 public IEnumerable<object> GetRow(int row) { 58 return variableValues.Select(x => x.Value[row]); 59 } 60 61 public void AddRow(IEnumerable<object> values) { 62 var list = values.ToList(); 63 if (list.Count != variableNames.Count) 64 throw new ArgumentException("The number of values must be equal to the number of variable names."); 65 // check if all the values are of the correct type 66 for (int i = 0; i < list.Count; ++i) { 67 if (list[i].GetType() != GetVariableType(variableNames[i])) { 68 throw new ArgumentException("The type of the provided value does not match the variable type."); 52 69 } 53 70 } 54 } 55 public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); } 56 public ModifiableDataset() : base() { } 57 58 public 
ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) : base(variableNames, variableValues) { } 71 // add values 72 for (int i = 0; i < list.Count; ++i) { 73 variableValues[variableNames[i]].Add(list[i]); 74 } 75 Rows++; 76 OnRowsChanged(); 77 OnReset(); 78 } 59 79 60 80 public void ReplaceRow(int row, IEnumerable<object> values) { … … 72 92 variableValues[variableNames[i]][row] = list[i]; 73 93 } 94 OnReset(); 95 } 96 97 // slow, avoid using this 98 public void RemoveRow(int row) { 99 foreach (var list in variableValues.Values) 100 list.RemoveAt(row); 101 Rows--; 102 OnRowsChanged(); 103 OnReset(); 104 } 105 106 // adds a new variable to the dataset 107 public void AddVariable(string variableName, IList values) { 108 InsertVariable(variableName, Columns, values); 109 } 110 111 public void InsertVariable(string variableName, int position, IList values) { 112 if (variableValues.ContainsKey(variableName)) 113 throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName)); 114 115 if (position < 0 || position > Columns) 116 throw new ArgumentException(string.Format("Incorrect position {0} specified. The position must be between 0 and {1}.", position, Columns)); 117 118 if (values == null) 119 throw new ArgumentNullException("values", "Values must not be null. 
At least an empty list of values has to be provided."); 120 121 if (values.Count != Rows) 122 throw new ArgumentException(string.Format("{0} values are provided, but {1} rows are present in the dataset.", values.Count, Rows)); 123 124 if (!IsAllowedType(values)) 125 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName)); 126 127 variableNames.Insert(position, variableName); 128 variableValues[variableName] = values; 129 130 OnColumnsChanged(); 131 OnColumnNamesChanged(); 74 132 OnReset(); 75 133 } … … 85 143 } 86 144 87 public void AddRow(IEnumerable<object> values) {88 var list = values.ToList();89 if (list.Count != variableNames.Count)90 throw new ArgumentException("The number of values must be equal to the number of variable names.");91 // check if all the values are of the correct type92 for (int i = 0; i < list.Count; ++i) {93 if (list[i].GetType() != GetVariableType(variableNames[i])) {94 throw new ArgumentException("The type of the provided value does not match the variable type.");95 }96 }97 // add values98 for (int i = 0; i < list.Count; ++i) {99 variableValues[variableNames[i]].Add(list[i]);100 }101 rows++;102 OnRowsChanged();103 OnReset();104 }105 106 // adds a new variable to the dataset107 public void AddVariable(string variableName, IList values) {108 if (variableValues.ContainsKey(variableName))109 throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName));110 111 if (values == null || values.Count == 0)112 throw new ArgumentException("Cannot add variable with no values.");113 114 if (!IsAllowedType(values))115 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));116 117 variableValues[variableName] = values;118 variableNames.Add(variableName);119 120 OnColumnsChanged();121 OnColumnNamesChanged();122 OnReset();123 }124 145 125 146 public void RemoveVariable(string 
variableName) { … … 133 154 } 134 155 135 // slow, avoid using this136 public void RemoveRow(int row) {137 foreach (var list in variableValues.Values)138 list.RemoveAt(row);139 rows--;156 public void ClearValues() { 157 foreach (var list in variableValues.Values) { 158 list.Clear(); 159 } 160 Rows = 0; 140 161 OnRowsChanged(); 141 162 OnReset(); 142 163 } 164 143 165 144 166 public void SetVariableValue(object value, string variableName, int row) { -
branches/2904_CalculateImpacts/HeuristicLab.Problems.DataAnalysis.Views/3.4
- Property svn:mergeinfo set to (toggle deleted branches)
/stable/HeuristicLab.Problems.DataAnalysis.Views/3.4 merged eligible /trunk/HeuristicLab.Problems.DataAnalysis.Views/3.4 merged eligible /branches/Async/HeuristicLab.Problems.DataAnalysis.Views/3.4 13329-15286 /branches/Benchmarking/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 6917-7005 /branches/ClassificationModelComparison/HeuristicLab.Problems.DataAnalysis.Views/3.4 9116-13099 /branches/CloningRefactoring/HeuristicLab.Problems.DataAnalysis.Views/3.4 4656-4721 /branches/DataAnalysis Refactoring/HeuristicLab.Problems.DataAnalysis.Views/3.4 5471-5808 /branches/DataAnalysis SolutionEnsembles/HeuristicLab.Problems.DataAnalysis.Views/3.4 5815-6180 /branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Views/3.4 4458-4459,4462,4464 /branches/DataPreprocessing/HeuristicLab.Problems.DataAnalysis.Views/3.4 10085-11101 /branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis.Views/3.4 8036-8538 /branches/GP.Grammar.Editor/HeuristicLab.Problems.DataAnalysis.Views/3.4 6284-6795 /branches/GP.Symbols (TimeLag, Diff, Integral)/HeuristicLab.Problems.DataAnalysis.Views/3.4 5060 /branches/HeuristicLab.DatasetRefactor/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 11570-12508 /branches/HeuristicLab.Problems.Orienteering/HeuristicLab.Problems.DataAnalysis.Views/3.4 11130-12721 /branches/HeuristicLab.RegressionSolutionGradientView/HeuristicLab.Problems.DataAnalysis.Views/3.4 13780-14091 /branches/HeuristicLab.TimeSeries/HeuristicLab.Problems.DataAnalysis.Views/3.4 7098-8789 /branches/NET40/sources/HeuristicLab.Problems.DataAnalysis.Views/3.4 5138-5162 /branches/ParallelEngine/HeuristicLab.Problems.DataAnalysis.Views/3.4 5175-5192 /branches/ProblemInstancesRegressionAndClassification/HeuristicLab.Problems.DataAnalysis.Views/3.4 7568-7810 /branches/QAPAlgorithms/HeuristicLab.Problems.DataAnalysis.Views/3.4 6350-6627 /branches/Restructure trunk solution/HeuristicLab.Problems.DataAnalysis.Views/3.4 6828 
/branches/SimplifierViewsProgress/HeuristicLab.Problems.DataAnalysis.Views/3.4 15318-15370 /branches/SuccessProgressAnalysis/HeuristicLab.Problems.DataAnalysis.Views/3.4 5370-5682 /branches/Trunk/HeuristicLab.Problems.DataAnalysis.Views/3.4 6829-6865 /branches/VNS/HeuristicLab.Problems.DataAnalysis.Views/3.4 5594-5752 /branches/histogram/HeuristicLab.Problems.DataAnalysis.Views/3.4 5959-6341 /branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Views/3.4 14232-14825
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
-
branches/2904_CalculateImpacts/HeuristicLab.Tests
- Property svn:mergeinfo changed
/branches/2839_HiveProjectManagement/HeuristicLab.Tests (added) merged: 16057 /trunk/HeuristicLab.Tests (added) merged: 16117
- Property svn:mergeinfo changed
Note: See TracChangeset for help on using the changeset viewer.