- Timestamp: 10/25/17 12:38:12 (7 years ago)
- Location: branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data
- Files:
  - 1 deleted
  - 3 edited
Legend:
- Unmodified
- Added
- Removed
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs
r15309 r15431 38 38 private IPreprocessingData filteredData; 39 39 40 public IList<PreprocessingDataColumn> DataColumns {41 get { return ActiveData.DataColumns; }42 }43 44 40 public IPreprocessingData ActiveData { 45 41 get { return IsFiltered ? filteredData : originalData; } … … 86 82 } 87 83 88 public I Enumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {84 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 89 85 return ActiveData.GetValues<T>(columnIndex, considerSelection); 90 86 } 91 87 92 public void SetValues<T>(int columnIndex, I Enumerable<T> values) {88 public void SetValues<T>(int columnIndex, IList<T> values) { 93 89 if (IsFiltered) 94 90 throw new InvalidOperationException("SetValues not possible while data is filtered"); … … 127 123 } 128 124 129 public void DeleteRows (IEnumerable<int> rows) {125 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 130 126 if (IsFiltered) 131 127 throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered"); 132 128 133 originalData.DeleteRows (rows);129 originalData.DeleteRowsWithIndices(rows); 134 130 } 135 131 … … 277 273 public void EndTransaction() { 278 274 originalData.EndTransaction(); 275 } 276 #endregion 277 278 #region Statistics 279 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 280 return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue); 281 } 282 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 283 return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue); 284 } 285 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 286 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 287 } 288 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 289 return 
ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 290 } 291 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 292 return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue); 293 } 294 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 295 return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue); 296 } 297 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 298 return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue); 299 } 300 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 301 return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue); 302 } 303 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 304 return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection); 305 } 306 307 public int GetMissingValueCount() { 308 return ActiveData.GetMissingValueCount(); 309 } 310 public int GetMissingValueCount(int columnIndex) { 311 return ActiveData.GetMissingValueCount(columnIndex); 312 } 313 public int GetRowMissingValueCount(int rowIndex) { 314 return ActiveData.GetRowMissingValueCount(rowIndex); 279 315 } 280 316 #endregion -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs
r15309 r15431 28 28 namespace HeuristicLab.DataPreprocessing { 29 29 public interface IPreprocessingData : INamedItem { 30 31 IList<PreprocessingDataColumn> DataColumns { get; }32 33 30 #region Cells 34 31 bool IsCellEmpty(int columnIndex, int rowIndex); … … 39 36 string GetCellAsString(int columnIndex, int rowIndex); 40 37 41 I Enumerable<T> GetValues<T>(int columnIndex, bool considerSelection = false);38 IList<T> GetValues<T>(int columnIndex, bool considerSelection = false); 42 39 43 void SetValues<T>(int columnIndex, I Enumerable<T> values);40 void SetValues<T>(int columnIndex, IList<T> values); 44 41 bool SetValue(string value, int columnIndex, int rowIndex); 45 42 … … 51 48 void InsertRow(int rowIndex); 52 49 void DeleteRow(int rowIndex); 53 void DeleteRows (IEnumerable<int> rows);50 void DeleteRowsWithIndices(IEnumerable<int> rows); 54 51 void InsertColumn<T>(string variableName, int columnIndex); 55 52 … … 109 106 void EndTransaction(); 110 107 #endregion 108 109 #region Statistics 110 T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 111 T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 112 T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 113 T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 114 T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>; 115 T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 116 T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 117 T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 118 int GetDistinctValues<T>(int columnIndex, bool considerSelection = false); 119 120 int GetMissingValueCount(); 121 int 
GetMissingValueCount(int columnIndex); 122 int GetRowMissingValueCount(int rowIndex); 123 #endregion 111 124 } 112 125 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15309 r15431 32 32 33 33 namespace HeuristicLab.DataPreprocessing { 34 34 35 [Item("PreprocessingData", "Represents data used for preprocessing.")] 35 36 [StorableClass] 36 37 public class PreprocessingData : NamedItem, IPreprocessingData { 37 38 38 [Storable] private List<PreprocessingDataColumn> dataColumns; 39 40 public IList<PreprocessingDataColumn> DataColumns { 41 get { return dataColumns; } 42 } 43 39 [Storable] 40 protected IList<IList> variableValues; 41 [Storable] 42 protected IList<string> variableNames; 44 43 45 44 #region Constructor, Cloning & Persistence … … 48 47 Name = "Preprocessing Data"; 49 48 50 dataColumns = new List<PreprocessingDataColumn>();51 49 Transformations = new List<ITransformation>(); 52 50 selection = new Dictionary<int, IList<int>>(); … … 59 57 protected PreprocessingData(PreprocessingData original, Cloner cloner) 60 58 : base(original, cloner) { 61 dataColumns = new List<PreprocessingDataColumn>(original.dataColumns.Select(cloner.Clone)); 62 TrainingPartition = cloner.Clone(original.TrainingPartition); 63 TestPartition = cloner.Clone(original.TestPartition); 59 variableValues = CopyVariableValues(original.variableValues); 60 variableNames = new List<string>(original.variableNames); 61 TrainingPartition = (IntRange)original.TrainingPartition.Clone(cloner); 62 TestPartition = (IntRange)original.TestPartition.Clone(cloner); 64 63 Transformations = new List<ITransformation>(original.Transformations.Select(cloner.Clone)); 65 64 … … 100 99 #region Cells 101 100 public bool IsCellEmpty(int columnIndex, int rowIndex) { 102 return !dataColumns[columnIndex].IsValidValue(rowIndex); 101 var value = variableValues[columnIndex][rowIndex]; 102 return IsMissingValue(value); 103 103 } 104 104 105 105 public T GetCell<T>(int columnIndex, int rowIndex) { 106 return dataColumns[columnIndex].TypeSwitch<T>( 107 c => c[rowIndex], 108 c => c[rowIndex], 109 c => c[rowIndex]); 106 return (T)variableValues[columnIndex][rowIndex]; 110 107 } 111 108 … … 118 
115 InsertColumn<T>(i.ToString(), i); 119 116 120 dataColumns[columnIndex].TypeSwitch<T>(value, 121 (c, v) => c[rowIndex] = v, 122 (c, v) => c[rowIndex] = v, 123 (c, v) => c[rowIndex] = v); 124 117 variableValues[columnIndex][rowIndex] = value; 125 118 if (!IsInTransaction) 126 119 OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); … … 128 121 129 122 public string GetCellAsString(int columnIndex, int rowIndex) { 130 return dataColumns[columnIndex].GetValue(rowIndex); 131 } 132 133 public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) { 134 return dataColumns[columnIndex].TypeSwitch<T>( 135 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 136 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 137 c => c.GetValues(considerSelection ? selection[columnIndex] : null)); 138 } 139 140 public void SetValues<T>(int columnIndex, IEnumerable<T> values) { 123 return variableValues[columnIndex][rowIndex].ToString(); 124 } 125 126 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 127 if (considerSelection) { 128 var list = new List<T>(); 129 foreach (var rowIdx in selection[columnIndex]) { 130 list.Add((T)variableValues[columnIndex][rowIdx]); 131 } 132 return list; 133 } else { 134 return (IList<T>)variableValues[columnIndex]; 135 } 136 } 137 138 public void SetValues<T>(int columnIndex, IList<T> values) { 141 139 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 142 140 if (VariableHasType<T>(columnIndex)) { 143 var name = dataColumns[columnIndex].Name; 144 if (dataColumns[columnIndex].IsType<double>()) { 145 dataColumns[columnIndex] = new DoublePreprocessingDataColumn(name, (IEnumerable<double>)values); 146 } else if (dataColumns[columnIndex].IsType<string>()) { 147 dataColumns[columnIndex] = new StringPreprocessingDataColumn(name, (IEnumerable<string>)values); 148 } else if (dataColumns[columnIndex].IsType<DateTime>()) { 149 
dataColumns[columnIndex] = new DateTimePreprocessingDataColumn(name, (IEnumerable<DateTime>)values); 141 variableValues[columnIndex] = (IList)values; 142 } else { 143 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name); 144 } 145 if (!IsInTransaction) 146 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 147 } 148 149 public bool SetValue(string value, int columnIndex, int rowIndex) { 150 bool valid = false; 151 if (VariableHasType<double>(columnIndex)) { 152 double val; 153 if (string.IsNullOrWhiteSpace(value)) { 154 val = double.NaN; 155 valid = true; 150 156 } else { 151 throw new ArgumentException("Unknown column type"); 152 } 157 valid = double.TryParse(value, out val); 158 } 159 if (valid) 160 SetCell(columnIndex, rowIndex, val); 161 } else if (VariableHasType<string>(columnIndex)) { 162 valid = value != null; 163 if (valid) 164 SetCell(columnIndex, rowIndex, value); 165 } else if (VariableHasType<DateTime>(columnIndex)) { 166 DateTime date; 167 valid = DateTime.TryParse(value, out date); 168 if (valid) 169 SetCell(columnIndex, rowIndex, date); 153 170 } else { 154 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + dataColumns[columnIndex].GetType().Name + " but was " + typeof(T).Name); 155 } 171 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 172 } 173 156 174 if (!IsInTransaction) 157 175 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 158 } 159 160 public bool SetValue(string value, int columnIndex, int rowIndex) { 161 var column = dataColumns[columnIndex]; 162 bool successful = column.SetValue(value, rowIndex); 163 164 if (!IsInTransaction) 165 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 166 167 return successful; 176 177 return valid; 168 178 } 169 179 170 180 public int 
Columns { 171 get { return dataColumns.Count; }181 get { return variableNames.Count; } 172 182 } 173 183 174 184 public int Rows { 175 get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; } 185 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 186 } 187 188 public static bool IsMissingValue(object value) { 189 if (value is double) return double.IsNaN((double)value); 190 if (value is string) return string.IsNullOrEmpty((string)value); 191 if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue); 192 throw new ArgumentException(); 176 193 } 177 194 #endregion … … 180 197 public void InsertRow(int rowIndex) { 181 198 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 182 183 foreach (var column in dataColumns) { 184 column.TypeSwitch( 185 c => c.Values.Insert(rowIndex, double.NaN), 186 c => c.Values.Insert(rowIndex, null), 187 c => c.Values.Insert(rowIndex, DateTime.MinValue)); 188 } 189 199 foreach (IList column in variableValues) { 200 Type type = column.GetType().GetGenericArguments()[0]; 201 column.Insert(rowIndex, type.IsValueType ? 
Activator.CreateInstance(type) : null); 202 } 190 203 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 191 204 TrainingPartition.End++; … … 201 214 } 202 215 } 203 204 216 if (!IsInTransaction) 205 217 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 206 218 } 207 208 219 public void DeleteRow(int rowIndex) { 209 DeleteRows(new[] { rowIndex }); 210 } 211 public void DeleteRows(IEnumerable<int> rowIndices) { 220 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 221 foreach (IList column in variableValues) { 222 column.RemoveAt(rowIndex); 223 } 224 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 225 TrainingPartition.End--; 226 if (TrainingPartition.End <= TestPartition.Start) { 227 TestPartition.Start--; 228 TestPartition.End--; 229 } 230 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 231 TestPartition.End--; 232 if (TestPartition.End <= TrainingPartition.Start) { 233 TestPartition.Start--; 234 TestPartition.End--; 235 } 236 } 237 if (!IsInTransaction) 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 239 } 240 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 212 241 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 213 214 foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) { 215 foreach (var column in dataColumns) { 216 column.TypeSwitch( 217 c => c.Values.RemoveAt(rowIndex), 218 c => c.Values.RemoveAt(rowIndex), 219 c => c.Values.RemoveAt(rowIndex)); 220 } 221 242 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 243 foreach (IList column in variableValues) { 244 column.RemoveAt(rowIndex); 245 } 222 246 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 223 247 TrainingPartition.End--; … … 234 258 } 235 259 } 236 237 260 if (!IsInTransaction) 238 261 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); … … 241 264 public void 
InsertColumn<T>(string variableName, int columnIndex) { 242 265 SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 243 244 if (typeof(T) == typeof(double)) { 245 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows))); 246 } else if (typeof(T) == typeof(string)) { 247 dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows))); 248 } else if (typeof(T) == typeof(DateTime)) { 249 dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows))); 250 } else { 251 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 252 } 253 266 variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows))); 267 variableNames.Insert(columnIndex, variableName); 254 268 if (!IsInTransaction) 255 269 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); … … 258 272 public void DeleteColumn(int columnIndex) { 259 273 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 260 261 dataColumns.RemoveAt(columnIndex); 262 274 variableValues.RemoveAt(columnIndex); 275 variableNames.RemoveAt(columnIndex); 263 276 if (!IsInTransaction) 264 277 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); … … 266 279 267 280 public void RenameColumn(int columnIndex, string name) { 268 if (columnIndex < 0 || columnIndex > dataColumns.Count) 281 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 282 if (columnIndex < 0 || columnIndex > variableNames.Count) 269 283 throw new ArgumentOutOfRangeException("columnIndex"); 270 271 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 272 273 dataColumns[columnIndex].Name = name; 284 variableNames[columnIndex] = name; 274 285 275 
286 if (!IsInTransaction) … … 279 290 public void RenameColumns(IList<string> names) { 280 291 if (names == null) throw new ArgumentNullException("names"); 281 if (names.Count != dataColumns.Count) throw new ArgumentException("number of names must match the number of columns.", "names");292 if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names"); 282 293 283 294 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 284 285 295 for (int i = 0; i < names.Count; i++) 286 dataColumns[i].Name= names[i];296 variableNames[i] = names[i]; 287 297 288 298 if (!IsInTransaction) … … 291 301 292 302 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 293 return columnIndices.All( VariableHasType<string>);303 return columnIndices.All(x => VariableHasType<string>(x)); 294 304 } 295 305 #endregion … … 297 307 #region Variables 298 308 public IEnumerable<string> VariableNames { 299 get { return dataColumns.Select(c => c.Name); }309 get { return variableNames; } 300 310 } 301 311 302 312 public IEnumerable<string> GetDoubleVariableNames() { 303 return dataColumns.OfType<DoublePreprocessingDataColumn>().Select(c => c.Name); 313 var doubleVariableNames = new List<string>(); 314 for (int i = 0; i < Columns; ++i) { 315 if (VariableHasType<double>(i)) { 316 doubleVariableNames.Add(variableNames[i]); 317 } 318 } 319 return doubleVariableNames; 304 320 } 305 321 306 322 public string GetVariableName(int columnIndex) { 307 return dataColumns[columnIndex].Name;323 return variableNames[columnIndex]; 308 324 } 309 325 310 326 public int GetColumnIndex(string variableName) { 311 return dataColumns.FindIndex(c => c.Name ==variableName);327 return variableNames.IndexOf(variableName); 312 328 } 313 329 314 330 public bool VariableHasType<T>(int columnIndex) { 315 return dataColumns[columnIndex].IsType<T>();331 return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>; 
316 332 } 317 333 318 334 public Type GetVariableType(int columnIndex) { 319 return dataColumns[columnIndex].GetValueType(); 335 var listType = variableValues[columnIndex].GetType(); 336 return listType.GenericTypeArguments.Single(); 320 337 } 321 338 … … 375 392 #region Import & Export 376 393 public void Import(IDataAnalysisProblemData problemData) { 377 var dataset = problemData.Dataset; 394 Dataset dataset = (Dataset)problemData.Dataset; 395 variableNames = new List<string>(problemData.Dataset.VariableNames); 378 396 InputVariables = new List<string>(problemData.AllowedInputVariables); 379 TargetVariable = problemData is IRegressionProblemData ? ((IRegressionProblemData)problemData).TargetVariable 380 : problemData is IClassificationProblemData ? ((IClassificationProblemData)problemData).TargetVariable 381 : null; 382 383 dataColumns.Clear(); 397 TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable 398 : (problemData is IClassificationProblemData) ? 
((IClassificationProblemData)problemData).TargetVariable 399 : null; 400 401 int columnIndex = 0; 402 variableValues = new List<IList>(); 384 403 foreach (var variableName in problemData.Dataset.VariableNames) { 385 404 if (dataset.VariableHasType<double>(variableName)) { 386 dataColumns.Add(new DoublePreprocessingDataColumn(variableName, dataset.GetDoubleValues(variableName)));405 variableValues.Insert(columnIndex, dataset.GetDoubleValues(variableName).ToList()); 387 406 } else if (dataset.VariableHasType<string>(variableName)) { 388 dataColumns.Add(new StringPreprocessingDataColumn(variableName, dataset.GetStringValues(variableName)));407 variableValues.Insert(columnIndex, dataset.GetStringValues(variableName).ToList()); 389 408 } else if (dataset.VariableHasType<DateTime>(variableName)) { 390 dataColumns.Add(new DateTimePreprocessingDataColumn(variableName, dataset.GetDateTimeValues(variableName)));409 variableValues.Insert(columnIndex, dataset.GetDateTimeValues(variableName).ToList()); 391 410 } else { 392 411 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 393 412 } 413 ++columnIndex; 394 414 } 395 415 … … 401 421 IList<IList> values = new List<IList>(); 402 422 403 for (int i = 0; i < Columns; i++) { 404 var doubleColumn = dataColumns[i] as DoublePreprocessingDataColumn; 405 var stringColumn = dataColumns[i] as StringPreprocessingDataColumn; 406 var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn; 407 if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues())); 408 else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues())); 409 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues())); 410 else throw new InvalidOperationException("Column type not supported for export"); 411 } 412 413 return new Dataset(VariableNames, values); 423 for (int i = 0; i < Columns; ++i) { 424 
values.Add(variableValues[i]); 425 } 426 427 var dataset = new Dataset(variableNames, values); 428 return dataset; 414 429 } 415 430 #endregion … … 437 452 438 453 #region Transactions 439 // S napshot/History are not storable/cloneable on purpose454 // Stapshot/History are nost storable/cloneable on purpose 440 455 private class Snapshot { 441 public List<PreprocessingDataColumn> DataColumns { get; set; } 456 public IList<IList> VariableValues { get; set; } 457 public IList<string> VariableNames { get; set; } 442 458 443 459 public IntRange TrainingPartition { get; set; } … … 456 472 } 457 473 458 private const int M axUndoDepth= 5;474 private const int MAX_UNDO_DEPTH = 5; 459 475 460 476 private readonly IList<Snapshot> undoHistory = new List<Snapshot>(); … … 466 482 if (IsInTransaction) return; 467 483 468 var cloner = new Cloner();469 484 var currentSnapshot = new Snapshot { 470 DataColumns = new List<PreprocessingDataColumn>(dataColumns.Select(cloner.Clone)), 485 VariableValues = CopyVariableValues(variableValues), 486 VariableNames = new List<string>(variableNames), 471 487 TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End), 472 488 TestPartition = new IntRange(TestPartition.Start, TestPartition.End), … … 477 493 }; 478 494 479 if (undoHistory.Count >= M axUndoDepth)495 if (undoHistory.Count >= MAX_UNDO_DEPTH) 480 496 undoHistory.RemoveAt(0); 481 497 … … 490 506 if (IsUndoAvailable) { 491 507 Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1]; 492 dataColumns = previousSnapshot.DataColumns; 508 variableValues = previousSnapshot.VariableValues; 509 variableNames = previousSnapshot.VariableNames; 493 510 TrainingPartition = previousSnapshot.TrainingPartition; 494 511 TestPartition = previousSnapshot.TestPartition; … … 521 538 #endregion 522 539 523 /* #region Statistics 524 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 525 try { 526 return 
dataColumns[columnIndex].TypeSwitch<T>( 527 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 528 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 529 col => col.GetMin(considerSelection ? Selection[columnIndex] : null)); 530 } catch (InvalidOperationException) { 531 return emptyValue; 532 } 533 } 534 535 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 536 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 537 return values.Any() ? values.Max() : emptyValue; 538 } 539 540 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 541 return 542 543 544 if (typeof(T) == typeof(double)) { 545 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 546 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 547 } 548 if (typeof(T) == typeof(string)) { 549 return Convert<T>(string.Empty); 550 } 551 if (typeof(T) == typeof(DateTime)) { 552 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 553 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 554 } 555 556 throw new InvalidOperationException(typeof(T) + " not supported"); 557 } 558 559 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 560 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 561 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 562 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 563 } 564 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 565 return values.Any() ? 
values.Quantile(0.5) : emptyValue; 566 } 567 568 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 569 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 570 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 571 } 572 573 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 574 if (typeof(T) == typeof(double)) { 575 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 576 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 577 } 578 // For DateTime, std.dev / variance would have to be TimeSpan 579 //if (typeof(T) == typeof(DateTime)) { 580 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 581 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 582 //} 583 return default(T); 584 } 585 586 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 587 if (typeof(T) == typeof(double)) { 588 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 589 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 590 } 591 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 592 //if (typeof(T) == typeof(DateTime)) { 593 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 594 // return values.Any() ? 
Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 595 //} 596 return default(T); 597 } 598 599 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 600 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 601 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 602 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 603 } 604 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 605 return values.Any() ? values.Quantile(alpha) : emptyValue; 606 } 607 608 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 609 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 610 return values.GroupBy(x => x).Count(); 611 } 612 613 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 614 return GetValues<T>(columnIndex, considerSelection).Where(x => 615 ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x, 616 (c, v) => c.IsValidValue(v), 617 (c, v) => c.IsValidValue(v), 618 (c, v) => c.IsValidValue(v) 619 )); 620 } 621 622 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 623 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 624 } 625 626 public int GetMissingValueCount() { 627 int count = 0; 628 for (int i = 0; i < Columns; ++i) { 629 count += GetMissingValueCount(i); 630 } 631 return count; 632 } 633 public int GetMissingValueCount(int columnIndex) { 634 int sum = 0; 635 for (int i = 0; i < Rows; i++) { 636 if (IsCellEmpty(columnIndex, i)) 637 sum++; 638 } 639 return sum; 640 } 641 public int GetRowMissingValueCount(int rowIndex) { 642 int sum = 0; 643 for (int i = 0; i < Columns; i++) { 644 if 
(IsCellEmpty(i, rowIndex)) 645 sum++; 646 } 647 return sum; 648 } 649 #endregion */ 540 #region Statistics 541 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 542 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 543 return values.Any() ? values.Min() : emptyValue; 544 } 545 546 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 547 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 548 return values.Any() ? values.Max() : emptyValue; 549 } 550 551 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 552 if (typeof(T) == typeof(double)) { 553 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 554 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 555 } 556 if (typeof(T) == typeof(string)) { 557 return Convert<T>(string.Empty); 558 } 559 if (typeof(T) == typeof(DateTime)) { 560 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 561 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 562 } 563 564 throw new InvalidOperationException(typeof(T) + " not supported"); 565 } 566 567 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 568 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 569 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 570 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 571 } 572 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 573 return values.Any() ? 
values.Quantile(0.5) : emptyValue; 574 } 575 576 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 577 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 578 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 579 } 580 581 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 582 if (typeof(T) == typeof(double)) { 583 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 584 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 585 } 586 // For DateTime, std.dev / variance would have to be TimeSpan 587 //if (typeof(T) == typeof(DateTime)) { 588 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 589 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 590 //} 591 return default(T); 592 } 593 594 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 595 if (typeof(T) == typeof(double)) { 596 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 597 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 598 } 599 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 600 //if (typeof(T) == typeof(DateTime)) { 601 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 602 // return values.Any() ? 
Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 603 //} 604 return default(T); 605 } 606 607 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 608 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 609 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 610 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 611 } 612 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 613 return values.Any() ? values.Quantile(alpha) : emptyValue; 614 } 615 616 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 617 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 618 return values.GroupBy(x => x).Count(); 619 } 620 621 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 622 return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x)); 623 } 624 625 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 626 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 627 } 628 private static T Convert<T>(object obj) { return (T)obj; } 629 630 public int GetMissingValueCount() { 631 int count = 0; 632 for (int i = 0; i < Columns; ++i) { 633 count += GetMissingValueCount(i); 634 } 635 return count; 636 } 637 public int GetMissingValueCount(int columnIndex) { 638 int sum = 0; 639 for (int i = 0; i < Rows; i++) { 640 if (IsCellEmpty(columnIndex, i)) 641 sum++; 642 } 643 return sum; 644 } 645 public int GetRowMissingValueCount(int rowIndex) { 646 int sum = 0; 647 for (int i = 0; i < Columns; i++) { 648 if (IsCellEmpty(i, rowIndex)) 649 sum++; 650 } 651 return sum; 652 } 653 #endregion 654 655 
#region Helpers 656 private static IList<IList> CopyVariableValues(IList<IList> original) { 657 var copy = new List<IList>(original); 658 for (int i = 0; i < original.Count; ++i) { 659 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 660 } 661 return copy; 662 } 663 #endregion 650 664 } 651 665
Note: See TracChangeset for help on using the changeset viewer.