Changeset 15309 for branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
- Timestamp:
- 08/07/17 09:43:58 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15291 r15309 36 36 public class PreprocessingData : NamedItem, IPreprocessingData { 37 37 38 [Storable] 39 protected List<PreprocessingDataColumn> dataColumns; 38 [Storable] private List<PreprocessingDataColumn> dataColumns; 39 40 public IList<PreprocessingDataColumn> DataColumns { 41 get { return dataColumns; } 42 } 43 40 44 41 45 #region Constructor, Cloning & Persistence … … 99 103 } 100 104 101 private void ColumnTypeSwitchAction<T>(int columnIndex, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,102 Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {103 ColumnTypeSwitchAction(dataColumns[columnIndex], value, doubleAction, stringAction, dateTimeAction);104 }105 private void ColumnTypeSwitchAction<T>(PreprocessingDataColumn column, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,106 Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {107 var doubleColumn = column as DoublePreprocessingDataColumn;108 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn, Convert<double?>(value));109 var stringColumn = column as StringPreprocessingDataColumn;110 if (stringColumn != null && stringAction != null) stringAction(stringColumn, Convert<string>(value));111 var dateTimeColumn = column as DateTimePreprocessingDataColumn;112 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn, Convert<DateTime?>(value));113 }114 115 private void ColumnTypeSwitchAction(int columnIndex, Action<DoublePreprocessingDataColumn> doubleAction,116 Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {117 ColumnTypeSwitchAction(dataColumns[columnIndex], doubleAction, stringAction, dateTimeAction);118 }119 private void ColumnTypeSwitchAction(PreprocessingDataColumn column, Action<DoublePreprocessingDataColumn> doubleAction,120 Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {121 var doubleColumn = column as DoublePreprocessingDataColumn;122 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn);123 var stringColumn = column as StringPreprocessingDataColumn;124 if (stringColumn != null && stringAction != null) stringAction(stringColumn);125 var dateTimeColumn = column as DateTimePreprocessingDataColumn;126 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn);127 }128 129 130 private T ColumnTypeSwitchFunc<T>(int columnIndex, Func<DoublePreprocessingDataColumn, double?> doubleFunc,131 Func<StringPreprocessingDataColumn, string> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?> dateTimeFunc = null) {132 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;133 if (doubleColumn != null && doubleFunc != null) return Convert<T>(doubleFunc(doubleColumn));134 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;135 if (stringColumn != null && stringFunc != null) return Convert<T>(stringFunc(stringColumn));136 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;137 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<T>(dateTimeFunc(dateTimeColumn));138 throw new InvalidOperationException("Invalid data column type.");139 }140 141 private T ColumnTypeSwitchFuncResult<T>(int columnIndex, Func<DoublePreprocessingDataColumn, T> doubleFunc,142 Func<StringPreprocessingDataColumn, T> stringFunc = null, Func<DateTimePreprocessingDataColumn, T> dateTimeFunc = null) {143 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;144 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn);145 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;146 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn);147 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;148 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn);149 throw new InvalidOperationException("Invalid data column type.");150 }151 private TOut ColumnTypeSwitchFuncResult<TIn, TOut>(int columnIndex, TIn value, Func<DoublePreprocessingDataColumn, double?, TOut> doubleFunc,152 Func<StringPreprocessingDataColumn, string, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?, TOut> dateTimeFunc = null) {153 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;154 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn, Convert<double?>(value));155 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;156 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn, Convert<string>(value));157 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;158 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn, Convert<DateTime?>(value));159 throw new InvalidOperationException("Invalid data column type.");160 }161 162 private IList<T> ColumnTypeSwitchFuncList<T>(int columnIndex, Func<DoublePreprocessingDataColumn, IList<double>> doubleFunc,163 Func<StringPreprocessingDataColumn, IList<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, IList<DateTime>> dateTimeFunc = null) {164 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;165 if (doubleColumn != null && doubleFunc != null) return Convert<IList<T>>(doubleFunc(doubleColumn));166 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;167 if (stringColumn != null && stringFunc != null) return Convert<IList<T>>(stringFunc(stringColumn));168 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;169 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IList<T>>(dateTimeFunc(dateTimeColumn));170 throw new InvalidOperationException("Invalid data column type.");171 }172 private static T Convert<T>(object obj) { return (T)obj; }173 174 175 105 public T GetCell<T>(int columnIndex, int rowIndex) { 176 return ColumnTypeSwitchFunc<T>(columnIndex,106 return dataColumns[columnIndex].TypeSwitch<T>( 177 107 c => c[rowIndex], 178 108 c => c[rowIndex], … … 188 118 InsertColumn<T>(i.ToString(), i); 189 119 190 ColumnTypeSwitchAction<T>(columnIndex,value,120 dataColumns[columnIndex].TypeSwitch<T>(value, 191 121 (c, v) => c[rowIndex] = v, 192 122 (c, v) => c[rowIndex] = v, … … 201 131 } 202 132 203 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 204 if (considerSelection) { 205 var list = new List<T>(); 206 foreach (var rowIdx in selection[columnIndex]) { 207 list.Add(GetCell<T>(columnIndex, rowIdx)); 208 //list.Add((T)dataColumns[columnIndex][rowIdx]); 209 } 210 return list; 211 } else { 212 return ColumnTypeSwitchFuncList<T>(columnIndex, 213 c => c.Values.Select(x => x ?? double.NaN).ToList(), 214 c => c.Values, 215 c => c.Values.Select(x => x ?? DateTime.MinValue).ToList()); 216 //(IList<T>)dataColumns[columnIndex]; 217 } 218 } 219 220 public void SetValues<T>(int columnIndex, IList<T> values) { 133 public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) { 134 return dataColumns[columnIndex].TypeSwitch<T>( 135 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 136 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 137 c => c.GetValues(considerSelection ? selection[columnIndex] : null)); 138 } 139 140 public void SetValues<T>(int columnIndex, IEnumerable<T> values) { 221 141 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 222 142 if (VariableHasType<T>(columnIndex)) { … … 239 159 240 160 public bool SetValue(string value, int columnIndex, int rowIndex) { 241 bool valid = false; 242 if (VariableHasType<double>(columnIndex)) { 243 double val; 244 if (string.IsNullOrWhiteSpace(value)) { 245 val = double.NaN; 246 valid = true; 247 } else { 248 valid = double.TryParse(value, out val); 249 } 250 if (valid) 251 SetCell(columnIndex, rowIndex, val); 252 } else if (VariableHasType<string>(columnIndex)) { 253 valid = value != null; 254 if (valid) 255 SetCell(columnIndex, rowIndex, value); 256 } else if (VariableHasType<DateTime>(columnIndex)) { 257 DateTime date; 258 valid = DateTime.TryParse(value, out date); 259 if (valid) 260 SetCell(columnIndex, rowIndex, date); 261 } else { 262 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 263 } 161 var column = dataColumns[columnIndex]; 162 bool successful = column.SetValue(value, rowIndex); 264 163 265 164 if (!IsInTransaction) 266 165 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 267 166 268 return valid;167 return successful; 269 168 } 270 169 … … 274 173 275 174 public int Rows { 276 get { return dataColumns. Count > 0 ? dataColumns[0].Length: 0; }175 get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; } 277 176 } 278 177 #endregion … … 281 180 public void InsertRow(int rowIndex) { 282 181 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 182 283 183 foreach (var column in dataColumns) { 284 ColumnTypeSwitchAction(column, 184 column.TypeSwitch( 185 c => c.Values.Insert(rowIndex, double.NaN), 285 186 c => c.Values.Insert(rowIndex, null), 286 c => c.Values.Insert(rowIndex, null), 287 c => c.Values.Insert(rowIndex, null)); 288 //var valueType = column.GetValueType(); 289 //column.Insert(rowIndex, valueType.IsValueType ? Activator.CreateInstance(valueType) : null); 290 } 187 c => c.Values.Insert(rowIndex, DateTime.MinValue)); 188 } 189 291 190 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 292 191 TrainingPartition.End++; … … 302 201 } 303 202 } 203 304 204 if (!IsInTransaction) 305 205 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 306 206 } 207 307 208 public void DeleteRow(int rowIndex) { 308 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 309 foreach (var column in dataColumns) { 310 ColumnTypeSwitchAction(column, 311 c => c.Values.RemoveAt(rowIndex), 312 c => c.Values.RemoveAt(rowIndex), 313 c => c.Values.RemoveAt(rowIndex)); 314 //column.RemoveAt(rowIndex); 315 } 316 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 317 TrainingPartition.End--; 318 if (TrainingPartition.End <= TestPartition.Start) { 319 TestPartition.Start--; 320 TestPartition.End--; 321 } 322 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 323 TestPartition.End--; 324 if (TestPartition.End <= TrainingPartition.Start) { 325 TestPartition.Start--; 326 TestPartition.End--; 327 } 328 } 329 if (!IsInTransaction) 330 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 331 } 332 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 209 DeleteRows(new[] { rowIndex }); 210 } 211 public void DeleteRows(IEnumerable<int> rowIndices) { 333 212 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 334 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 213 214 foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) { 335 215 foreach (var column in dataColumns) { 336 ColumnTypeSwitchAction(column,216 column.TypeSwitch( 337 217 c => c.Values.RemoveAt(rowIndex), 338 218 c => c.Values.RemoveAt(rowIndex), 339 219 c => c.Values.RemoveAt(rowIndex)); 340 //column.RemoveAt(rowIndex);341 } 220 } 221 342 222 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 343 223 TrainingPartition.End--; … … 354 234 } 355 235 } 236 356 237 if (!IsInTransaction) 357 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); … … 362 243 363 244 if (typeof(T) == typeof(double)) { 364 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double ?>(null, Rows)));245 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows))); 365 246 } else if (typeof(T) == typeof(string)) { 366 dataColumns. Add(new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(null, Rows)));247 dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows))); 367 248 } else if (typeof(T) == typeof(DateTime)) { 368 dataColumns. Add(new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime?>(null, Rows)));249 dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows))); 369 250 } else { 370 251 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 371 252 } 372 253 373 //dataColumns.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));374 //variableNames.Insert(columnIndex, variableName);375 254 if (!IsInTransaction) 376 255 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); … … 379 258 public void DeleteColumn(int columnIndex) { 380 259 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 260 381 261 dataColumns.RemoveAt(columnIndex); 382 //variableNames.RemoveAt(columnIndex); 262 383 263 if (!IsInTransaction) 384 264 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); … … 386 266 387 267 public void RenameColumn(int columnIndex, string name) { 388 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);389 268 if (columnIndex < 0 || columnIndex > dataColumns.Count) 390 269 throw new ArgumentOutOfRangeException("columnIndex"); 270 271 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 272 391 273 dataColumns[columnIndex].Name = name; 392 274 … … 400 282 401 283 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 284 402 285 for (int i = 0; i < names.Count; i++) 403 286 dataColumns[i].Name = names[i]; … … 408 291 409 292 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 410 return columnIndices.All( x => VariableHasType<string>(x));293 return columnIndices.All(VariableHasType<string>); 411 294 } 412 295 #endregion … … 522 405 var stringColumn = dataColumns[i] as StringPreprocessingDataColumn; 523 406 var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn; 524 if (doubleColumn != null) values.Add(new List<double>(doubleColumn. Values.Select(x => x ?? double.NaN)));525 else if (stringColumn != null) values.Add(new List<string>(stringColumn. Values));526 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn. Values.Select(x => x ?? DateTime.MinValue)));407 if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues())); 408 else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues())); 409 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues())); 527 410 else throw new InvalidOperationException("Column type not supported for export"); 528 411 } … … 638 521 #endregion 639 522 640 #region Statistics 641 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 642 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 643 return values.Any() ? values.Min() : emptyValue; 644 } 645 646 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 647 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 648 return values.Any() ? values.Max() : emptyValue; 649 } 650 651 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 652 if (typeof(T) == typeof(double)) { 653 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 654 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 655 } 656 if (typeof(T) == typeof(string)) { 657 return Convert<T>(string.Empty); 658 } 659 if (typeof(T) == typeof(DateTime)) { 660 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 661 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 662 } 663 664 throw new InvalidOperationException(typeof(T) + " not supported"); 665 } 666 667 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 668 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 669 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 670 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 671 } 672 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 673 return values.Any() ? values.Quantile(0.5) : emptyValue; 674 } 675 676 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 677 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 678 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 679 } 680 681 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 682 if (typeof(T) == typeof(double)) { 683 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 684 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 685 } 686 // For DateTime, std.dev / variance would have to be TimeSpan 687 //if (typeof(T) == typeof(DateTime)) { 688 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 689 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 690 //} 691 return default(T); 692 } 693 694 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 695 if (typeof(T) == typeof(double)) { 696 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 697 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 698 } 699 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 700 //if (typeof(T) == typeof(DateTime)) { 701 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 702 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 703 //} 704 return default(T); 705 } 706 707 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 708 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 709 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 710 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 711 } 712 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 713 return values.Any() ? values.Quantile(alpha) : emptyValue; 714 } 715 716 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 717 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 718 return values.GroupBy(x => x).Count(); 719 } 720 721 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 722 //var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn; 723 //var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn; 724 //var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn; 725 //return GetValues<T>(columnIndex, considerSelection).Where(x => 726 // doubleColumn != null ? doubleColumn.IsValidValue(Convert<double>(x)) 727 // : stringColumn != null ? stringColumn.IsValidValue(Convert<string>(x)) 728 // : dateTimeColumn != null ? dateTimeColumn.IsValidValue(Convert<DateTime>(x)) 729 // : false); 730 //!IsMissingValue(x)); 731 732 return GetValues<T>(columnIndex, considerSelection).Where(x => 733 ColumnTypeSwitchFuncResult<T, bool>(columnIndex, x, 734 (c, v) => v.HasValue && c.IsValidValue(v.Value), 735 (c, v) => c.IsValidValue(v), 736 (c, v) => v.HasValue && c.IsValidValue(v.Value) 737 )); 738 } 739 740 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 741 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 742 } 743 744 public int GetMissingValueCount() { 745 int count = 0; 746 for (int i = 0; i < Columns; ++i) { 747 count += GetMissingValueCount(i); 748 } 749 return count; 750 } 751 public int GetMissingValueCount(int columnIndex) { 752 int sum = 0; 753 for (int i = 0; i < Rows; i++) { 754 if (IsCellEmpty(columnIndex, i)) 755 sum++; 756 } 757 return sum; 758 } 759 public int GetRowMissingValueCount(int rowIndex) { 760 int sum = 0; 761 for (int i = 0; i < Columns; i++) { 762 if (IsCellEmpty(i, rowIndex)) 763 sum++; 764 } 765 return sum; 766 } 767 #endregion 768 769 #region Helpers 770 private static IList<IList> CopyVariableValues(IList<IList> original) { 771 var copy = new List<IList>(original); 772 for (int i = 0; i < original.Count; ++i) { 773 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 774 } 775 return copy; 776 } 777 #endregion 523 /* #region Statistics 524 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 525 try { 526 return dataColumns[columnIndex].TypeSwitch<T>( 527 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 528 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 529 col => col.GetMin(considerSelection ? Selection[columnIndex] : null)); 530 } catch (InvalidOperationException) { 531 return emptyValue; 532 } 533 } 534 535 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 536 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 537 return values.Any() ? values.Max() : emptyValue; 538 } 539 540 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 541 return 542 543 544 if (typeof(T) == typeof(double)) { 545 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 546 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 547 } 548 if (typeof(T) == typeof(string)) { 549 return Convert<T>(string.Empty); 550 } 551 if (typeof(T) == typeof(DateTime)) { 552 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 553 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 554 } 555 556 throw new InvalidOperationException(typeof(T) + " not supported"); 557 } 558 559 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 560 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 561 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 562 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 563 } 564 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 565 return values.Any() ? values.Quantile(0.5) : emptyValue; 566 } 567 568 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 569 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 570 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 571 } 572 573 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 574 if (typeof(T) == typeof(double)) { 575 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 576 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 577 } 578 // For DateTime, std.dev / variance would have to be TimeSpan 579 //if (typeof(T) == typeof(DateTime)) { 580 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 581 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 582 //} 583 return default(T); 584 } 585 586 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 587 if (typeof(T) == typeof(double)) { 588 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 589 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 590 } 591 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 592 //if (typeof(T) == typeof(DateTime)) { 593 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 594 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 595 //} 596 return default(T); 597 } 598 599 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 600 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 601 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 602 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 603 } 604 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 605 return values.Any() ? values.Quantile(alpha) : emptyValue; 606 } 607 608 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 609 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 610 return values.GroupBy(x => x).Count(); 611 } 612 613 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 614 return GetValues<T>(columnIndex, considerSelection).Where(x => 615 ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x, 616 (c, v) => c.IsValidValue(v), 617 (c, v) => c.IsValidValue(v), 618 (c, v) => c.IsValidValue(v) 619 )); 620 } 621 622 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 623 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 624 } 625 626 public int GetMissingValueCount() { 627 int count = 0; 628 for (int i = 0; i < Columns; ++i) { 629 count += GetMissingValueCount(i); 630 } 631 return count; 632 } 633 public int GetMissingValueCount(int columnIndex) { 634 int sum = 0; 635 for (int i = 0; i < Rows; i++) { 636 if (IsCellEmpty(columnIndex, i)) 637 sum++; 638 } 639 return sum; 640 } 641 public int GetRowMissingValueCount(int rowIndex) { 642 int sum = 0; 643 for (int i = 0; i < Columns; i++) { 644 if (IsCellEmpty(i, rowIndex)) 645 sum++; 646 } 647 return sum; 648 } 649 #endregion */ 778 650 } 779 651
Note: See TracChangeset
for help on using the changeset viewer.