Changeset 15283 for branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
- Timestamp: 07/24/17 15:17:35 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
// Changeset excerpt r15274 -> r15283; "…" marks lines omitted by the changeset viewer.
    }

    /// <summary>
    /// Returns the element type stored in the column at <paramref name="columnIndex"/>.
    /// </summary>
    /// <remarks>
    /// Reads the single generic type argument of the column's list type;
    /// <c>Single()</c> throws if that type does not have exactly one generic argument.
    /// </remarks>
    public Type GetVariableType(int columnIndex) {
      var listType = variableValues[columnIndex].GetType();
      return listType.GenericTypeArguments.Single();
    }

    public IList<string> InputVariables { get; private set; }
    public string TargetVariable { get; private set; } // optional
    // … (lines omitted by the changeset viewer) …
    #endregion

    #region Statistics
    // Note for all statistics below: GetValuesWithoutMissingValues returns a deferred
    // (Where-filtered) sequence, so each method materializes it once instead of
    // enumerating the column twice (once for the emptiness check, once for the aggregate).

    /// <summary>Smallest non-missing value of the column, or <paramref name="emptyValue"/> if there is none.</summary>
    public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Min() : emptyValue;
    }

    /// <summary>Largest non-missing value of the column, or <paramref name="emptyValue"/> if there is none.</summary>
    public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Max() : emptyValue;
    }

    /// <summary>
    /// Mean of the non-missing values of the column.
    /// </summary>
    /// <remarks>
    /// Supported for <c>double</c> and <c>DateTime</c> (averaged via <see cref="AggregateAsDouble"/>).
    /// For <c>string</c> the result is always <c>string.Empty</c>, independent of <paramref name="emptyValue"/>.
    /// </remarks>
    /// <exception cref="InvalidOperationException">T is none of double, string, DateTime.</exception>
    public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
      if (typeof(T) == typeof(double)) {
        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection).ToList();
        return values.Count > 0 ? Convert<T>(values.Average()) : emptyValue;
      }
      if (typeof(T) == typeof(string)) {
        return Convert<T>(string.Empty);
      }
      if (typeof(T) == typeof(DateTime)) {
        var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection).ToList();
        return values.Count > 0 ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
      }

      throw new InvalidOperationException(typeof(T) + " not supported");
    }

    /// <summary>Median (0.5 quantile) of the non-missing values, or <paramref name="emptyValue"/> if there is none.</summary>
    public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster
        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection).ToList();
        return doubleValues.Count > 0 ? Convert<T>(doubleValues.Median()) : emptyValue;
      }
      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Quantile(0.5) : emptyValue;
    }

    /// <summary>
    /// Most frequent non-missing value, or <paramref name="emptyValue"/> if there is none.
    /// Ties are resolved in favour of the value encountered first (stable ordering of the groups).
    /// </summary>
    public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0
        ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First()
        : emptyValue;
    }

    /// <summary>
    /// Standard deviation of the non-missing values. Only <c>double</c> is supported;
    /// every other T yields <c>default(T)</c> (not <paramref name="emptyValue"/>).
    /// </summary>
    public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
      if (typeof(T) == typeof(double)) {
        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection).ToList();
        return values.Count > 0 ? Convert<T>(values.StandardDeviation()) : emptyValue;
      }
      // For DateTime, std.dev / variance would have to be TimeSpan, hence not supported here.
      return default(T);
    }

    /// <summary>
    /// Variance of the non-missing values. Only <c>double</c> is supported;
    /// every other T yields <c>default(T)</c> (not <paramref name="emptyValue"/>).
    /// </summary>
    public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
      if (typeof(T) == typeof(double)) {
        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection).ToList();
        return values.Count > 0 ? Convert<T>(values.Variance()) : emptyValue;
      }
      // DateTime variance often overflows long, thus the corresponding DateTime is invalid.
      return default(T);
    }

    /// <summary>The <paramref name="alpha"/> quantile of the non-missing values, or <paramref name="emptyValue"/> if there is none.</summary>
    public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster
        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection).ToList();
        return doubleValues.Count > 0 ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
      }
      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).ToList();
      return values.Count > 0 ? values.Quantile(alpha) : emptyValue;
    }

    /// <summary>Number of distinct non-missing values in the column (default equality comparer).</summary>
    public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
      return GetValuesWithoutMissingValues<T>(columnIndex, considerSelection).Distinct().Count();
    }

    // Deferred sequence of the column's values with everything IsMissingValue reports filtered out.
    private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
      return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x));
    }

    // Applies a double-based aggregate to DateTime values: ticks are scaled to seconds,
    // aggregated by func, then scaled back to ticks and truncated to long.
    private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
      return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
    }

    // Casts through object so that a value of a concrete type can be returned as the generic T
    // after a typeof(T) check; throws InvalidCastException if the runtime types do not match.
    private static T Convert<T>(object obj) { return (T)obj; }

    /// <summary>Total number of empty cells over all columns.</summary>
    public int GetMissingValueCount() {
      int count = 0;
      for (int i = 0; i < Columns; ++i) {
        count += GetMissingValueCount(i);
      }
      return count;
    }

    /// <summary>Number of empty cells in the column at <paramref name="columnIndex"/>.</summary>
    public int GetMissingValueCount(int columnIndex) {
      int sum = 0;
      for (int i = 0; i < Rows; i++) {
        if (IsCellEmpty(columnIndex, i))
          sum++;
      }
      return sum;
    }

    /// <summary>Number of empty cells in the row at <paramref name="rowIndex"/>.</summary>
    public int GetRowMissingValueCount(int rowIndex) {
      int sum = 0;
      for (int i = 0; i < Columns; i++) {
        if (IsCellEmpty(i, rowIndex))
          sum++;
      }
      return sum;
    }
    #endregion

// Changeset excerpt (continued); "…" marks lines omitted by the changeset viewer.
    #region Helpers
    private static IList<IList> CopyVariableValues(IList<IList> original) {
      // … (lines omitted by the changeset viewer) …
    #endregion
  }

  // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions
  internal static class EnumerableExtensions {
    /// <summary>
    /// Returns the <paramref name="alpha"/> quantile of <paramref name="values"/> as an element of the
    /// sequence: the element of rank <c>ceil(n * alpha)</c> (1-based) in sorted order. No interpolation
    /// is done, so this works for any comparable type.
    /// </summary>
    /// <param name="values">Values to select from; copied to an array, the source is not modified.</param>
    /// <param name="alpha">Quantile level in [0, 1]; 0 yields the minimum, 1 the maximum.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="alpha"/> is NaN or outside [0, 1].</exception>
    /// <exception cref="InvalidOperationException"><paramref name="values"/> is empty.</exception>
    public static T Quantile<T>(this IEnumerable<T> values, double alpha) where T : IComparable<T> {
      if (double.IsNaN(alpha) || alpha < 0.0 || alpha > 1.0)
        throw new ArgumentOutOfRangeException("alpha", alpha, "The quantile level must be within [0, 1].");

      T[] valuesArr = values.ToArray();
      int n = valuesArr.Length;
      if (n == 0) throw new InvalidOperationException("Enumeration contains no elements.");

      // ceil(n * alpha) - 1 is -1 for alpha == 0, which would index before the array;
      // clamp so that the 0-quantile is the smallest element.
      int k = Math.Max(0, (int)Math.Ceiling(n * alpha) - 1);

      return Select(k, valuesArr);
    }

    // Quickselect with median-of-three pivoting: returns the element that would be at index k
    // if arr were sorted. Reorders arr in place. Expects 0 <= k < arr.Length.
    private static T Select<T>(int k, T[] arr) where T : IComparable<T> {
      int i, ir, j, l, mid, n = arr.Length;
      T a;
      l = 0;
      ir = n - 1;
      for (;;) {
        if (ir <= l + 1) {
          // Active partition contains 1 or 2 elements.
          if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) {
            // Case of 2 elements.
            Swap(arr, l, ir);
          }
          return arr[k];
        } else {
          // Choose the median of the left, center, and right elements as partitioning element a.
          // Also rearrange so that arr[l] <= arr[l+1] <= arr[ir]; the outer two then act as
          // sentinels that stop the scans below.
          mid = (l + ir) >> 1;
          Swap(arr, mid, l + 1);

          if (arr[l].CompareTo(arr[ir]) > 0) {
            Swap(arr, l, ir);
          }

          if (arr[l + 1].CompareTo(arr[ir]) > 0) {
            Swap(arr, l + 1, ir);
          }
          if (arr[l].CompareTo(arr[l + 1]) > 0) {
            Swap(arr, l, l + 1);
          }
          i = l + 1; // Initialize pointers for partitioning.
          j = ir;
          a = arr[l + 1]; // Partitioning element.
          for (;;) { // Beginning of innermost loop.
            do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element >= a.
            do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element <= a.
            if (j < i) break; // Pointers crossed. Partitioning complete.
            Swap(arr, i, j);
          } // End of innermost loop.
          arr[l + 1] = arr[j]; // Insert partitioning element.
          arr[j] = a;
          // Keep active the partition that contains the kth element.
          if (j >= k) ir = j - 1;
          if (j <= k) l = i;
        }
      }
    }

    // Exchanges arr[i] and arr[j].
    private static void Swap<T>(T[] arr, int i, int j) {
      T temp = arr[i];
      arr[i] = arr[j];
      arr[j] = temp;
    }
  }
}
Note: See TracChangeset for help on using the changeset viewer.