- Timestamp:
- 09/14/18 11:47:37 (6 years ago)
- Location:
- branches/2817-BinPackingSpeedup
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2817-BinPackingSpeedup
- Property svn:mergeinfo changed
-
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis
- Property svn:mergeinfo changed
-
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs
r16140 r16141 38 38 protected Dataset(Dataset original, Cloner cloner) 39 39 : base(original, cloner) { 40 // no need to clone the variable values because these can't be modified 40 41 variableValues = new Dictionary<string, IList>(original.variableValues); 41 42 variableNames = new List<string>(original.variableNames); 42 43 rows = original.rows; 43 44 } 45 44 46 public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); } 45 47 … … 58 60 /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param> 59 61 public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) 60 : base() { 62 : this(variableNames, variableValues, cloneValues: true) { 63 } 64 65 protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) { 61 66 Name = "-"; 62 if (!variableNames.Any()) { 67 68 if (variableNames.Any()) { 69 this.variableNames = new List<string>(variableNames); 70 } else { 63 71 this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList(); 64 } else if (variableNames.Count() != variableValues.Count()) { 65 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 66 } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) { 67 throw new ArgumentException("The number of values must be equal for every variable"); 68 } else if (variableNames.Distinct().Count() != variableNames.Count()) { 69 var duplicateVariableNames = 70 variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 71 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 72 foreach (var duplicateVariableName in duplicateVariableNames) 73 message += duplicateVariableName + Environment.NewLine; 74 throw new ArgumentException(message); 75 } 72 } 73 // check if the 
arguments are consistent (no duplicate variables, same number of rows, correct data types, ...) 74 CheckArguments(this.variableNames, variableValues); 75 76 76 rows = variableValues.First().Count; 77 this.variableNames = new List<string>(variableNames); 78 this.variableValues = new Dictionary<string, IList>(this.variableNames.Count); 79 for (int i = 0; i < this.variableNames.Count; i++) { 80 var values = variableValues.ElementAt(i); 81 this.variableValues.Add(this.variableNames[i], values); 77 78 if (cloneValues) { 79 this.variableValues = CloneValues(this.variableNames, variableValues); 80 } else { 81 this.variableValues = new Dictionary<string, IList>(this.variableNames.Count); 82 for (int i = 0; i < this.variableNames.Count; i++) { 83 var variableName = this.variableNames[i]; 84 var values = variableValues.ElementAt(i); 85 this.variableValues.Add(variableName, values); 86 } 82 87 } 83 88 } … … 111 116 112 117 public ModifiableDataset ToModifiable() { 113 var values = new List<IList>(); 114 foreach (var v in variableNames) { 115 if (VariableHasType<double>(v)) { 116 values.Add(new List<double>((IList<double>)variableValues[v])); 117 } else if (VariableHasType<string>(v)) { 118 values.Add(new List<string>((IList<string>)variableValues[v])); 119 } else if (VariableHasType<DateTime>(v)) { 120 values.Add(new List<DateTime>((IList<DateTime>)variableValues[v])); 121 } else { 122 throw new ArgumentException("Unknown variable type."); 123 } 124 } 125 return new ModifiableDataset(variableNames, values); 126 } 118 return new ModifiableDataset(variableNames, variableNames.Select(v => variableValues[v]), true); 119 } 120 127 121 /// <summary> 128 122 /// Shuffle a dataset's rows … … 135 129 } 136 130 137 protected Dataset(Dataset dataset) : this(dataset.variableNames, dataset.variableValues.Values) { } 131 138 132 139 133 #region Backwards compatible code, remove with 3.5 … … 231 225 return new ReadOnlyCollection<DateTime>(values); 232 226 } 233 234 235 227 private 
IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) { 236 228 var values = GetValues<T>(variableName); … … 248 240 return variableValues[variableName] is IList<T>; 249 241 } 242 protected Type GetVariableType(string variableName) { 243 IList list; 244 variableValues.TryGetValue(variableName, out list); 245 if (list == null) 246 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset."); 247 return GetElementType(list); 248 } 249 protected static Type GetElementType(IList list) { 250 var type = list.GetType(); 251 return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType(); 252 } 253 protected static bool IsAllowedType(IList list) { 254 var type = GetElementType(list); 255 return IsAllowedType(type); 256 } 257 protected static bool IsAllowedType(Type type) { 258 return type == typeof(double) || type == typeof(string) || type == typeof(DateTime); 259 } 260 261 protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) { 262 if (variableNames.Count() != variableValues.Count()) { 263 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 264 } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) { 265 throw new ArgumentException("The number of values must be equal for every variable"); 266 } else if (variableNames.Distinct().Count() != variableNames.Count()) { 267 var duplicateVariableNames = 268 variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 269 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 270 foreach (var duplicateVariableName in duplicateVariableNames) 271 message += duplicateVariableName + Environment.NewLine; 272 throw new ArgumentException(message); 273 } 274 // check if all the variables are supported 275 foreach (var t in variableNames.Zip(variableValues, 
Tuple.Create)) { 276 var variableName = t.Item1; 277 var values = t.Item2; 278 279 if (!IsAllowedType(values)) { 280 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName)); 281 } 282 } 283 } 284 285 protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) { 286 return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value)); 287 } 288 289 protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) { 290 return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2)); 291 } 292 293 protected static IList CloneValues(IList values) { 294 var doubleValues = values as IList<double>; 295 if (doubleValues != null) return new List<double>(doubleValues); 296 297 var stringValues = values as IList<string>; 298 if (stringValues != null) return new List<string>(stringValues); 299 300 var dateTimeValues = values as IList<DateTime>; 301 if (dateTimeValues != null) return new List<DateTime>(dateTimeValues); 302 303 throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values))); 304 } 250 305 251 306 #region IStringConvertibleMatrix Members 252 307 [Storable] 253 protected int rows; 308 private int rows; 254 309 public int Rows { 255 310 get { return rows; } 311 protected set { rows = value; } 256 312 } 257 313 int IStringConvertibleMatrix.Rows { -
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationSolutionVariableImpactsCalculator.cs
r16140 r16141 100 100 var problemData = solution.ProblemData; 101 101 var dataset = problemData.Dataset; 102 var model = (IClassificationModel)solution.Model.Clone(); //mkommend: clone of model is necessary, because the thresholds for IDiscriminantClassificationModels are updated 102 103 103 104 IEnumerable<int> rows; … … 137 138 // calculate impacts for double variables 138 139 foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) { 139 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod); 140 var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows, replacementMethod); 140 141 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); 141 142 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs."); … … 150 151 var smallestImpact = double.PositiveInfinity; 151 152 foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) { 152 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 153 var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows, 153 154 Enumerable.Repeat(repl, dataset.Rows)); 154 155 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); … … 164 165 // calculate impacts for factor variables 165 166 166 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 167 var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows, 167 168 factorReplacementMethod); 168 169 var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error); … … 263 264 var originalValues =
dataset.GetReadOnlyDoubleValues(variable).ToList(); 264 265 dataset.ReplaceVariable(variable, replacementValues.ToList()); 266 267 var discModel = model as IDiscriminantFunctionClassificationModel; 268 if (discModel != null) { 269 var problemData = new ClassificationProblemData(dataset, dataset.VariableNames, model.TargetVariable); 270 discModel.RecalculateModelParameters(problemData, rows); 271 } 272 265 273 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 266 274 var estimates = model.GetEstimatedClassValues(dataset, rows).ToList(); … … 273 281 var originalValues = dataset.GetReadOnlyStringValues(variable).ToList(); 274 282 dataset.ReplaceVariable(variable, replacementValues.ToList()); 283 284 285 var discModel = model as IDiscriminantFunctionClassificationModel; 286 if (discModel != null) { 287 var problemData = new ClassificationProblemData(dataset, dataset.VariableNames, model.TargetVariable); 288 discModel.RecalculateModelParameters(problemData, rows); 289 } 290 275 291 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 276 292 var estimates = model.GetEstimatedClassValues(dataset, rows).ToList(); -
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/DataAnalysisProblemData.cs
r16140 r16141 163 163 164 164 var variables = dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable)); 165 var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x))); 165 var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x).AsReadOnly())); 166 166 foreach (StringValue x in inputVariables) 167 167 inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value)); -
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs
r16140 r16141 52 52 All 53 53 } 54 54 55 55 private const string ReplacementParameterName = "Replacement Method"; 56 56 private const string DataPartitionParameterName = "DataPartition"; … … 96 96 DataPartitionEnum data = DataPartitionEnum.Training, 97 97 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median, 98 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) { 98 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best, 99 Func<double, string, bool> progressCallback = null) { 99 100 100 101 var problemData = solution.ProblemData; … … 134 135 var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList(); 135 136 137 int curIdx = 0; 138 int count = allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>).Count(); 136 139 // calculate impacts for double variables 137 140 foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) { 141 //Report the current progress in percent. If the callback returns true, it means the execution shall be stopped 142 if (progressCallback != null) { 143 curIdx++; 144 if (progressCallback((double)curIdx / count, string.Format("Calculating impact for variable {0} ({1} of {2})", inputVariable, curIdx, count))) { return null; } 145 } 138 146 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod); 139 147 var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error); … … 180 188 } 181 189 190 182 191 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) { 183 192 var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList(); -
branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/ModifiableDataset.cs
r16140 r16141 39 39 40 40 private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner) { 41 var variables = variableValues.Keys.ToList(); 42 foreach (var v in variables) { 43 var type = GetVariableType(v); 44 if (type == typeof(DateTime)) { 45 variableValues[v] = GetDateTimeValues(v).ToList(); 46 } else if (type == typeof(double)) { 47 variableValues[v] = GetDoubleValues(v).ToList(); 48 } else if (type == typeof(string)) { 49 variableValues[v] = GetStringValues(v).ToList(); 50 } else { 51 throw new ArgumentException("Unsupported type " + type + " for variable " + v); 41 variableNames = new List<string>(original.variableNames); 42 variableValues = CloneValues(original.variableValues); 43 } 44 45 public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); } 46 47 public ModifiableDataset() { } 48 49 public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) : 50 base(variableNames, variableValues, cloneValues) { } 51 52 public Dataset ToDataset() { 53 return new Dataset(variableNames, variableNames.Select(v => variableValues[v])); 54 } 55 56 57 public IEnumerable<object> GetRow(int row) { 58 return variableValues.Select(x => x.Value[row]); 59 } 60 61 public void AddRow(IEnumerable<object> values) { 62 var list = values.ToList(); 63 if (list.Count != variableNames.Count) 64 throw new ArgumentException("The number of values must be equal to the number of variable names."); 65 // check if all the values are of the correct type 66 for (int i = 0; i < list.Count; ++i) { 67 if (list[i].GetType() != GetVariableType(variableNames[i])) { 68 throw new ArgumentException("The type of the provided value does not match the variable type."); 52 69 } 53 70 } 54 } 55 public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); } 56 public ModifiableDataset() : base() { } 57 58 public 
ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) : base(variableNames, variableValues) { } 71 // add values 72 for (int i = 0; i < list.Count; ++i) { 73 variableValues[variableNames[i]].Add(list[i]); 74 } 75 Rows++; 76 OnRowsChanged(); 77 OnReset(); 78 } 59 79 60 80 public void ReplaceRow(int row, IEnumerable<object> values) { … … 72 92 variableValues[variableNames[i]][row] = list[i]; 73 93 } 94 OnReset(); 95 } 96 97 // slow, avoid using this 98 public void RemoveRow(int row) { 99 foreach (var list in variableValues.Values) 100 list.RemoveAt(row); 101 Rows--; 102 OnRowsChanged(); 103 OnReset(); 104 } 105 106 // adds a new variable to the dataset 107 public void AddVariable(string variableName, IList values) { 108 InsertVariable(variableName, Columns, values); 109 } 110 111 public void InsertVariable(string variableName, int position, IList values) { 112 if (variableValues.ContainsKey(variableName)) 113 throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName)); 114 115 if (position < 0 || position > Columns) 116 throw new ArgumentException(string.Format("Incorrect position {0} specified. The position must be between 0 and {1}.", position, Columns)); 117 118 if (values == null) 119 throw new ArgumentNullException("values", "Values must not be null. 
At least an empty list of values has to be provided."); 120 121 if (values.Count != Rows) 122 throw new ArgumentException(string.Format("{0} values are provided, but {1} rows are present in the dataset.", values.Count, Rows)); 123 124 if (!IsAllowedType(values)) 125 throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName)); 126 127 variableNames.Insert(position, variableName); 128 variableValues[variableName] = values; 129 130 OnColumnsChanged(); 131 OnColumnNamesChanged(); 74 132 OnReset(); 75 133 } … … 85 143 } 86 144 87 public void AddRow(IEnumerable<object> values) {88 var list = values.ToList();89 if (list.Count != variableNames.Count)90 throw new ArgumentException("The number of values must be equal to the number of variable names.");91 // check if all the values are of the correct type92 for (int i = 0; i < list.Count; ++i) {93 if (list[i].GetType() != GetVariableType(variableNames[i])) {94 throw new ArgumentException("The type of the provided value does not match the variable type.");95 }96 }97 // add values98 for (int i = 0; i < list.Count; ++i) {99 variableValues[variableNames[i]].Add(list[i]);100 }101 rows++;102 OnRowsChanged();103 OnReset();104 }105 106 // adds a new variable to the dataset107 public void AddVariable<T>(string variableName, IEnumerable<T> values) {108 if (variableValues.ContainsKey(variableName))109 throw new ArgumentException("Variable " + variableName + " is already present in the dataset.");110 int count = values.Count();111 if (count != rows)112 throw new ArgumentException("The number of values must exactly match the number of rows in the dataset.");113 variableValues[variableName] = new List<T>(values);114 variableNames.Add(variableName);115 OnColumnsChanged();116 OnColumnNamesChanged();117 OnReset();118 }119 145 120 146 public void RemoveVariable(string variableName) { 121 147 if (!variableValues.ContainsKey(variableName)) 122 throw new ArgumentException( "The 
variable " + variableName + " does not exist in the dataset.");148 throw new ArgumentException(string.Format("The variable {0} does not exist in the dataset.", variableName)); 123 149 variableValues.Remove(variableName); 124 150 variableNames.Remove(variableName); … … 128 154 } 129 155 130 // slow, avoid to use this131 public void RemoveRow(int row) {132 foreach (var list in variableValues.Values)133 list.RemoveAt(row);134 rows--;156 public void ClearValues() { 157 foreach (var list in variableValues.Values) { 158 list.Clear(); 159 } 160 Rows = 0; 135 161 OnRowsChanged(); 136 162 OnReset(); 137 163 } 164 138 165 139 166 public void SetVariableValue(object value, string variableName, int row) { … … 151 178 } 152 179 153 private Type GetVariableType(string variableName) {154 IList list;155 variableValues.TryGetValue(variableName, out list);156 if (list == null)157 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");158 return list.GetType().GetGenericArguments()[0];159 }160 161 180 bool IStringConvertibleMatrix.SetValue(string value, int rowIndex, int columnIndex) { 162 181 var variableName = variableNames[columnIndex];
Note: See TracChangeset for help on using the changeset viewer.