Changeset 16057 for branches/2839_HiveProjectManagement/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
- Timestamp:
- 08/06/18 18:15:29 (6 years ago)
- Location:
- branches/2839_HiveProjectManagement
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2839_HiveProjectManagement
- Property svn:mergeinfo changed
-
branches/2839_HiveProjectManagement/HeuristicLab.DataPreprocessing
- Property svn:mergeinfo changed
-
branches/2839_HiveProjectManagement/HeuristicLab.DataPreprocessing/3.4
- Property svn:mergeinfo changed
-
branches/2839_HiveProjectManagement/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15110 r16057 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-201 6Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.Globalization; 25 26 using System.Linq; 26 27 using HeuristicLab.Common; 27 28 using HeuristicLab.Core; 28 29 using HeuristicLab.Data; 30 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 29 31 using HeuristicLab.Problems.DataAnalysis; 30 32 … … 32 34 33 35 [Item("PreprocessingData", "Represents data used for preprocessing.")] 34 public abstract class PreprocessingData : NamedItem, IPreprocessingData { 35 public IntRange TrainingPartition { get; set; } 36 public IntRange TestPartition { get; set; } 37 38 public IList<ITransformation> Transformations { get; protected set; } 39 36 [StorableClass] 37 public class PreprocessingData : NamedItem, IPreprocessingData { 38 39 [Storable] 40 40 protected IList<IList> variableValues; 41 [Storable] 41 42 protected IList<string> variableNames; 42 43 43 public IEnumerable<string> VariableNames { 44 get { return variableNames; } 45 } 46 47 public IEnumerable<string> GetDoubleVariableNames() { 48 var doubleVariableNames = new List<string>(); 49 for (int i = 0; i < Columns; ++i) { 50 if (VariableHasType<double>(i)) { 51 doubleVariableNames.Add(variableNames[i]); 52 } 53 } 54 return doubleVariableNames; 55 } 56 57 public IList<string> InputVariables { get; private set; } 58 public string TargetVariable { get; private set; } // optional 59 60 public int Columns { 61 get { return variableNames.Count; } 62 } 63 64 public int Rows { 65 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 66 } 67 68 protected IDictionary<int, IList<int>> selection; 69 public IDictionary<int, IList<int>> Selection { 70 get { return selection; } 71 set { 72 selection = value; 73 OnSelectionChanged(); 74 } 44 #region Constructor, Cloning & Persistence 45 public PreprocessingData(IDataAnalysisProblemData problemData) 46 : base() { 47 Name = "Preprocessing Data"; 48 49 Transformations = new List<ITransformation>(); 50 selection = new Dictionary<int, IList<int>>(); 51 52 Import(problemData); 53 54 RegisterEventHandler(); 75 55 } 76 56 … … 88 68 RegisterEventHandler(); 89 69 } 90 91 protected PreprocessingData(IDataAnalysisProblemData problemData) 92 : base() { 93 Name = "Preprocessing Data"; 94 95 Transformations = new List<ITransformation>(); 96 selection = new Dictionary<int, IList<int>>(); 97 98 Import(problemData); 99 70 public override IDeepCloneable Clone(Cloner cloner) { 71 return new PreprocessingData(this, cloner); 72 } 73 74 [StorableConstructor] 75 protected PreprocessingData(bool deserializing) 76 : base(deserializing) { } 77 [StorableHook(HookType.AfterDeserialization)] 78 private void AfterDeserialization() { 100 79 RegisterEventHandler(); 101 80 } 102 81 82 private void RegisterEventHandler() { 83 Changed += (s, e) => { 84 switch (e.Type) { 85 case DataPreprocessingChangedEventType.DeleteRow: 86 case DataPreprocessingChangedEventType.Any: 87 case DataPreprocessingChangedEventType.Transformation: 88 int maxRowIndex = Math.Max(0, Rows); 89 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); 90 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); 91 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); 92 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); 93 break; 94 } 95 }; 96 } 97 #endregion 98 99 #region Cells 100 public bool IsCellEmpty(int columnIndex, int rowIndex) { 101 var value = variableValues[columnIndex][rowIndex]; 102 return IsMissingValue(value); 103 } 104 105 public T GetCell<T>(int columnIndex, int rowIndex) { 106 return (T)variableValues[columnIndex][rowIndex]; 107 } 108 109 public void SetCell<T>(int columnIndex, int rowIndex, T value) { 110 SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); 111 112 for (int i = Rows; i <= rowIndex; i++) 113 InsertRow(i); 114 for (int i = Columns; i <= columnIndex; i++) 115 InsertColumn<T>(i.ToString(), i); 116 117 variableValues[columnIndex][rowIndex] = value; 118 if (!IsInTransaction) 119 OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); 120 } 121 122 public string GetCellAsString(int columnIndex, int rowIndex) { 123 return variableValues[columnIndex][rowIndex].ToString(); 124 } 125 126 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 127 if (considerSelection) { 128 var list = new List<T>(); 129 foreach (var rowIdx in selection[columnIndex]) { 130 list.Add((T)variableValues[columnIndex][rowIdx]); 131 } 132 return list; 133 } else { 134 return (IList<T>)variableValues[columnIndex]; 135 } 136 } 137 138 public void SetValues<T>(int columnIndex, IList<T> values) { 139 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 140 if (VariableHasType<T>(columnIndex)) { 141 variableValues[columnIndex] = (IList)values; 142 } else { 143 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name); 144 } 145 if (!IsInTransaction) 146 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 147 } 148 149 public bool SetValue(string value, int columnIndex, int rowIndex) { 150 bool valid = false; 151 if (VariableHasType<double>(columnIndex)) { 152 double val; 153 if (string.IsNullOrWhiteSpace(value)) { 154 val = double.NaN; 155 valid = true; 156 } else { 157 valid = double.TryParse(value, out val); 158 } 159 if (valid) 160 SetCell(columnIndex, rowIndex, val); 161 } else if (VariableHasType<string>(columnIndex)) { 162 valid = value != null; 163 if (valid) 164 SetCell(columnIndex, rowIndex, value); 165 } else if (VariableHasType<DateTime>(columnIndex)) { 166 DateTime date; 167 valid = DateTime.TryParse(value, out date); 168 if (valid) 169 SetCell(columnIndex, rowIndex, date); 170 } else { 171 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 172 } 173 174 if (!IsInTransaction) 175 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 176 177 return valid; 178 } 179 180 public int Columns { 181 get { return variableNames.Count; } 182 } 183 184 public int Rows { 185 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 186 } 187 188 public static bool IsMissingValue(object value) { 189 if (value is double) return double.IsNaN((double)value); 190 if (value is string) return string.IsNullOrEmpty((string)value); 191 if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue); 192 throw new ArgumentException(); 193 } 194 #endregion 195 196 #region Rows 197 public void InsertRow(int rowIndex) { 198 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 199 foreach (IList column in variableValues) { 200 Type type = column.GetType().GetGenericArguments()[0]; 201 column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null); 202 } 203 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 204 TrainingPartition.End++; 205 if (TrainingPartition.End <= TestPartition.Start) { 206 TestPartition.Start++; 207 TestPartition.End++; 208 } 209 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 210 TestPartition.End++; 211 if (TestPartition.End <= TrainingPartition.Start) { 212 TestPartition.Start++; 213 TestPartition.End++; 214 } 215 } 216 if (!IsInTransaction) 217 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 218 } 219 public void DeleteRow(int rowIndex) { 220 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 221 foreach (IList column in variableValues) { 222 column.RemoveAt(rowIndex); 223 } 224 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 225 TrainingPartition.End--; 226 if (TrainingPartition.End <= TestPartition.Start) { 227 TestPartition.Start--; 228 TestPartition.End--; 229 } 230 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 231 TestPartition.End--; 232 if (TestPartition.End <= TrainingPartition.Start) { 233 TestPartition.Start--; 234 TestPartition.End--; 235 } 236 } 237 if (!IsInTransaction) 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 239 } 240 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 241 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 242 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 243 foreach (IList column in variableValues) { 244 column.RemoveAt(rowIndex); 245 } 246 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 247 TrainingPartition.End--; 248 if (TrainingPartition.End <= TestPartition.Start) { 249 TestPartition.Start--; 250 TestPartition.End--; 251 } 252 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 253 TestPartition.End--; 254 if (TestPartition.End <= TrainingPartition.Start) { 255 TestPartition.Start--; 256 TestPartition.End--; 257 } 258 } 259 } 260 if (!IsInTransaction) 261 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); 262 } 263 264 public void InsertColumn<T>(string variableName, int columnIndex) { 265 SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 266 variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows))); 267 variableNames.Insert(columnIndex, variableName); 268 if (!IsInTransaction) 269 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 270 } 271 272 public void DeleteColumn(int columnIndex) { 273 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 274 variableValues.RemoveAt(columnIndex); 275 variableNames.RemoveAt(columnIndex); 276 if (!IsInTransaction) 277 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 278 } 279 280 public void RenameColumn(int columnIndex, string name) { 281 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 282 if (columnIndex < 0 || columnIndex > variableNames.Count) 283 throw new ArgumentOutOfRangeException("columnIndex"); 284 variableNames[columnIndex] = name; 285 286 if (!IsInTransaction) 287 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 288 } 289 290 public void RenameColumns(IList<string> names) { 291 if (names == null) throw new ArgumentNullException("names"); 292 if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names"); 293 294 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 295 for (int i = 0; i < names.Count; i++) 296 variableNames[i] = names[i]; 297 298 if (!IsInTransaction) 299 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 300 } 301 302 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 303 return columnIndices.All(x => VariableHasType<string>(x)); 304 } 305 #endregion 306 307 #region Variables 308 public IEnumerable<string> VariableNames { 309 get { return variableNames; } 310 } 311 312 public IEnumerable<string> GetDoubleVariableNames() { 313 var doubleVariableNames = new List<string>(); 314 for (int i = 0; i < Columns; ++i) { 315 if (VariableHasType<double>(i)) { 316 doubleVariableNames.Add(variableNames[i]); 317 } 318 } 319 return doubleVariableNames; 320 } 321 322 public string GetVariableName(int columnIndex) { 323 return variableNames[columnIndex]; 324 } 325 326 public int GetColumnIndex(string variableName) { 327 return variableNames.IndexOf(variableName); 328 } 329 330 public bool VariableHasType<T>(int columnIndex) { 331 return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>; 332 } 333 334 public Type GetVariableType(int columnIndex) { 335 var listType = variableValues[columnIndex].GetType(); 336 return listType.GenericTypeArguments.Single(); 337 } 338 339 public IList<string> InputVariables { get; private set; } 340 public string TargetVariable { get; private set; } // optional 341 #endregion 342 343 #region Partitions 344 [Storable] 345 public IntRange TrainingPartition { get; set; } 346 [Storable] 347 public IntRange TestPartition { get; set; } 348 #endregion 349 350 #region Transformations 351 [Storable] 352 public IList<ITransformation> Transformations { get; protected set; } 353 #endregion 354 355 #region Validation 356 public bool Validate(string value, out string errorMessage, int columnIndex) { 357 if (columnIndex < 0 || columnIndex > VariableNames.Count()) { 358 throw new ArgumentOutOfRangeException("column index is out of range"); 359 } 360 361 bool valid = false; 362 errorMessage = string.Empty; 363 if (VariableHasType<double>(columnIndex)) { 364 if (string.IsNullOrWhiteSpace(value)) { 365 valid = true; 366 } else { 367 double val; 368 valid = double.TryParse(value, out val); 369 if (!valid) { 370 errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")"; 371 } 372 } 373 } else if (VariableHasType<string>(columnIndex)) { 374 valid = value != null; 375 if (!valid) { 376 errorMessage = "Invalid Value (string must not be null)"; 377 } 378 } else if (VariableHasType<DateTime>(columnIndex)) { 379 DateTime date; 380 valid = DateTime.TryParse(value, out date); 381 if (!valid) { 382 errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat + "\""; 383 } 384 } else { 385 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 386 } 387 388 return valid; 389 } 390 #endregion 391 392 #region Import & Export 103 393 public void Import(IDataAnalysisProblemData problemData) { 104 394 Dataset dataset = (Dataset)problemData.Dataset; … … 107 397 TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable 108 398 : (problemData is IClassificationProblemData) ? ((IClassificationProblemData)problemData).TargetVariable 109 : null;399 : null; 110 400 111 401 int columnIndex = 0; … … 128 418 } 129 419 130 private void RegisterEventHandler() { 131 Changed += (s, e) => { 132 switch (e.Type) { 133 case DataPreprocessingChangedEventType.DeleteRow: 134 CheckPartitionRanges(); 135 break; 136 case DataPreprocessingChangedEventType.Any: 137 CheckPartitionRanges(); 138 break; 139 case DataPreprocessingChangedEventType.Transformation: 140 CheckPartitionRanges(); 141 break; 142 } 143 }; 144 } 145 146 private void CheckPartitionRanges() { 147 int maxRowIndex = Math.Max(0, Rows); 148 TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); 149 TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); 150 TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); 151 TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); 152 } 153 154 protected IList<IList> CopyVariableValues(IList<IList> original) { 155 var copy = new List<IList>(original); 156 for (int i = 0; i < original.Count; ++i) { 157 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 158 } 159 return copy; 160 } 161 162 163 #region IPreprocessingData Members 164 public abstract T GetCell<T>(int columnIndex, int rowIndex); 165 166 public abstract void SetCell<T>(int columnIndex, int rowIndex, T value); 167 168 public abstract string GetCellAsString(int columnIndex, int rowIndex); 169 170 public abstract string GetVariableName(int columnIndex); 171 172 public abstract int GetColumnIndex(string variableName); 173 174 public abstract bool VariableHasType<T>(int columnIndex); 175 176 [Obsolete("use the index based variant, is faster")] 177 public abstract IList<T> GetValues<T>(string variableName, bool considerSelection); 178 179 public abstract IList<T> GetValues<T>(int columnIndex, bool considerSelection); 180 181 public abstract void SetValues<T>(int columnIndex, IList<T> values); 182 183 public abstract bool SetValue(string value, int columnIndex, int rowIndex); 184 185 public abstract bool Validate(string value, out string errorMessage, int columnIndex); 186 187 public abstract bool AreAllStringColumns(IEnumerable<int> columnIndices); 188 189 public abstract void DeleteRowsWithIndices(IEnumerable<int> rows); 190 191 public abstract void InsertRow(int rowIndex); 192 193 public abstract void DeleteRow(int rowIndex); 194 195 public abstract void InsertColumn<T>(string variableName, int columnIndex); 196 197 public abstract void DeleteColumn(int columnIndex); 198 199 public abstract void RenameColumn(int columnIndex, string name); 200 public abstract void RenameColumns(IList<string> list); 201 202 public abstract Dataset ExportToDataset(); 203 204 public abstract void ClearSelection(); 205 206 public abstract event EventHandler SelectionChanged; 207 protected abstract void OnSelectionChanged(); 420 public Dataset ExportToDataset() { 421 IList<IList> values = new List<IList>(); 422 423 for (int i = 0; i < Columns; ++i) { 424 values.Add(variableValues[i]); 425 } 426 427 var dataset = new Dataset(variableNames, values); 428 return dataset; 429 } 430 #endregion 431 432 #region Selection 433 [Storable] 434 protected IDictionary<int, IList<int>> selection; 435 public IDictionary<int, IList<int>> Selection { 436 get { return selection; } 437 set { 438 selection = value; 439 OnSelectionChanged(); 440 } 441 } 442 public void ClearSelection() { 443 Selection = new Dictionary<int, IList<int>>(); 444 } 445 446 public event EventHandler SelectionChanged; 447 protected void OnSelectionChanged() { 448 var listeners = SelectionChanged; 449 if (listeners != null) listeners(this, EventArgs.Empty); 450 } 451 #endregion 452 453 #region Transactions 454 // Stapshot/History are nost storable/cloneable on purpose 455 private class Snapshot { 456 public IList<IList> VariableValues { get; set; } 457 public IList<string> VariableNames { get; set; } 458 459 public IntRange TrainingPartition { get; set; } 460 public IntRange TestPartition { get; set; } 461 public IList<ITransformation> Transformations { get; set; } 462 public DataPreprocessingChangedEventType ChangedType { get; set; } 463 464 public int ChangedColumn { get; set; } 465 public int ChangedRow { get; set; } 466 } 208 467 209 468 public event DataPreprocessingChangedEventHandler Changed; … … 212 471 if (listeners != null) listeners(this, new DataPreprocessingChangedEventArgs(type, column, row)); 213 472 } 214 #endregion 473 474 private const int MAX_UNDO_DEPTH = 5; 475 476 private readonly IList<Snapshot> undoHistory = new List<Snapshot>(); 477 private readonly Stack<DataPreprocessingChangedEventType> eventStack = new Stack<DataPreprocessingChangedEventType>(); 478 479 public bool IsInTransaction { get { return eventStack.Count > 0; } } 480 481 private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) { 482 if (IsInTransaction) return; 483 484 var currentSnapshot = new Snapshot { 485 VariableValues = CopyVariableValues(variableValues), 486 VariableNames = new List<string>(variableNames), 487 TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End), 488 TestPartition = new IntRange(TestPartition.Start, TestPartition.End), 489 Transformations = new List<ITransformation>(Transformations), 490 ChangedType = changedType, 491 ChangedColumn = column, 492 ChangedRow = row 493 }; 494 495 if (undoHistory.Count >= MAX_UNDO_DEPTH) 496 undoHistory.RemoveAt(0); 497 498 undoHistory.Add(currentSnapshot); 499 } 500 501 public bool IsUndoAvailable { 502 get { return undoHistory.Count > 0; } 503 } 504 505 public void Undo() { 506 if (IsUndoAvailable) { 507 Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1]; 508 variableValues = previousSnapshot.VariableValues; 509 variableNames = previousSnapshot.VariableNames; 510 TrainingPartition = previousSnapshot.TrainingPartition; 511 TestPartition = previousSnapshot.TestPartition; 512 Transformations = previousSnapshot.Transformations; 513 undoHistory.Remove(previousSnapshot); 514 OnChanged(previousSnapshot.ChangedType, 515 previousSnapshot.ChangedColumn, 516 previousSnapshot.ChangedRow); 517 } 518 } 519 520 public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) { 521 BeginTransaction(type); 522 action(); 523 EndTransaction(); 524 } 525 526 public void BeginTransaction(DataPreprocessingChangedEventType type) { 527 SaveSnapshot(type, -1, -1); 528 eventStack.Push(type); 529 } 530 531 public void EndTransaction() { 532 if (eventStack.Count == 0) 533 throw new InvalidOperationException("There is no open transaction that can be ended."); 534 535 var @event = eventStack.Pop(); 536 OnChanged(@event, -1, -1); 537 } 538 #endregion 539 540 #region Statistics 541 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 542 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 543 return values.Any() ? values.Min() : emptyValue; 544 } 545 546 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 547 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 548 return values.Any() ? values.Max() : emptyValue; 549 } 550 551 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 552 if (typeof(T) == typeof(double)) { 553 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 554 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 555 } 556 if (typeof(T) == typeof(string)) { 557 return Convert<T>(string.Empty); 558 } 559 if (typeof(T) == typeof(DateTime)) { 560 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 561 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 562 } 563 564 throw new InvalidOperationException(typeof(T) + " not supported"); 565 } 566 567 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 568 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 569 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 570 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 571 } 572 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 573 return values.Any() ? values.Quantile(0.5) : emptyValue; 574 } 575 576 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 577 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 578 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 579 } 580 581 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 582 if (typeof(T) == typeof(double)) { 583 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 584 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 585 } 586 // For DateTime, std.dev / variance would have to be TimeSpan 587 //if (typeof(T) == typeof(DateTime)) { 588 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 589 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 590 //} 591 return default(T); 592 } 593 594 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 595 if (typeof(T) == typeof(double)) { 596 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 597 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 598 } 599 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 600 //if (typeof(T) == typeof(DateTime)) { 601 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 602 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 603 //} 604 return default(T); 605 } 606 607 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 608 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 609 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 610 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 611 } 612 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 613 return values.Any() ? values.Quantile(alpha) : emptyValue; 614 } 615 616 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 617 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 618 return values.GroupBy(x => x).Count(); 619 } 620 621 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 622 return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x)); 623 } 624 625 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 626 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 627 } 628 private static T Convert<T>(object obj) { return (T)obj; } 629 630 public int GetMissingValueCount() { 631 int count = 0; 632 for (int i = 0; i < Columns; ++i) { 633 count += GetMissingValueCount(i); 634 } 635 return count; 636 } 637 public int GetMissingValueCount(int columnIndex) { 638 int sum = 0; 639 for (int i = 0; i < Rows; i++) { 640 if (IsCellEmpty(columnIndex, i)) 641 sum++; 642 } 643 return sum; 644 } 645 public int GetRowMissingValueCount(int rowIndex) { 646 int sum = 0; 647 for (int i = 0; i < Columns; i++) { 648 if (IsCellEmpty(i, rowIndex)) 649 sum++; 650 } 651 return sum; 652 } 653 #endregion 654 655 #region Helpers 656 private static IList<IList> CopyVariableValues(IList<IList> original) { 657 var copy = new List<IList>(original); 658 for (int i = 0; i < original.Count; ++i) { 659 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 660 } 661 return copy; 662 } 663 #endregion 664 } 665 666 // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions 667 internal static class EnumerableExtensions { 668 public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> { 669 T[] valuesArr = values.ToArray(); 670 int n = valuesArr.Length; 671 if (n == 0) throw new InvalidOperationException("Enumeration contains no elements."); 672 673 var pos = n * alpha; 674 675 return Select((int)Math.Ceiling(pos) - 1, valuesArr); 676 677 } 678 679 private static T Select<T>(int k, T[] arr) where T : IComparable<T> { 680 int i, ir, j, l, mid, n = arr.Length; 681 T a; 682 l = 0; 683 ir = n - 1; 684 for (;;) { 685 if (ir <= l + 1) { 686 // Active partition contains 1 or 2 elements. 687 if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) { 688 // Case of 2 elements. 689 Swap(arr, l, ir); 690 } 691 return arr[k]; 692 } else { 693 mid = (l + ir) >> 1; // Choose median of left, center, and right elements 694 Swap(arr, mid, l + 1); // as partitioning element a. Also 695 696 if (arr[l].CompareTo(arr[ir]) > 0) { // rearrange so that arr[l] arr[ir] <= arr[l+1], 697 Swap(arr, l, ir); // . arr[ir] >= arr[l+1] 698 } 699 700 if (arr[l + 1].CompareTo(arr[ir]) > 0) { 701 Swap(arr, l + 1, ir); 702 } 703 if (arr[l].CompareTo(arr[l + 1]) > 0) { 704 Swap(arr, l, l + 1); 705 } 706 i = l + 1; // Initialize pointers for partitioning. 707 j = ir; 708 a = arr[l + 1]; // Partitioning element. 709 for (;;) { // Beginning of innermost loop. 710 do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element > a. 711 do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element < a. 712 if (j < i) break; // Pointers crossed. Partitioning complete. 713 Swap(arr, i, j); 714 } // End of innermost loop. 715 arr[l + 1] = arr[j]; // Insert partitioning element. 716 arr[j] = a; 717 if (j >= k) ir = j - 1; // Keep active the partition that contains the 718 if (j <= k) l = i; // kth element. 719 } 720 } 721 } 722 723 private static void Swap<T>(T[] arr, int i, int j) { 724 T temp = arr[i]; 725 arr[i] = arr[j]; 726 arr[j] = temp; 727 } 215 728 } 216 729 }
Note: See TracChangeset
for help on using the changeset viewer.