Changeset 15309


Ignore:
Timestamp:
08/07/17 09:43:58 (13 days ago)
Author:
pfleck
Message:

#2809 Worked on type-safe PreprocessingDataColumns.

Location:
branches/DataPreprocessing Cleanup
Files:
14 edited

Legend:

Unmodified
Added
Removed
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing.Views/3.4/StatisticsView.cs

    r15283 r15309  
    9292      numericColumnsTextBox.Text = GetColumnCount<double>().ToString();
    9393      nominalColumnsTextBox5.Text = GetColumnCount<string>().ToString();
    94       missingValuesTextBox.Text = data.GetMissingValueCount().ToString();
    95       totalValuesTextBox.Text = (data.Rows * data.Rows - data.GetMissingValueCount()).ToString();
     94      missingValuesTextBox.Text = data.DataColumns.Sum(c => c.GetNumberOfMissingValues()).ToString();
     95      totalValuesTextBox.Text = (data.Rows * data.Rows - data.DataColumns.Sum(c => c.GetNumberOfMissingValues())).ToString();
    9696
    9797      var variableNames = Content.PreprocessingData.VariableNames.ToList();
     
    111111        for (int j = 0; j < statistics.Count; j++) {
    112112          if (horizontal)
    113             statisticsMatrix[j, i] = statistics[j];
     113            statisticsMatrix[j, i] = statistics[j].ToString();
    114114          else
    115             statisticsMatrix[i, j] = statistics[j];
     115            statisticsMatrix[i, j] = statistics[j].ToString();
    116116        }
    117117      }
     
    148148    }
    149149
    150     private List<string> GetStatistics(int varIdx) {
    151       List<string> list;
     150    private IList GetStatistics(int varIdx) {
     151      IList list;
    152152      var data = Content.PreprocessingData;
    153153      if (data.VariableHasType<double>(varIdx)) {
     
    166166    }
    167167
    168     private List<string> GetDoubleColumns(int statIdx) {
    169       var data = Content.PreprocessingData;
    170       return new List<string> {
    171         data.GetVariableType(statIdx).Name,
    172         data.GetMissingValueCount(statIdx).ToString(),
    173         data.GetMin<double>(statIdx, emptyValue: double.NaN).ToString(),
    174         data.GetMax<double>(statIdx, emptyValue: double.NaN).ToString(),
    175         data.GetMedian<double>(statIdx, emptyValue: double.NaN).ToString(),
    176         data.GetMean<double>(statIdx, emptyValue: double.NaN).ToString(),
    177         data.GetStandardDeviation<double>(statIdx, emptyValue: double.NaN).ToString(),
    178         data.GetVariance<double>(statIdx, emptyValue: double.NaN).ToString(),
    179         data.GetQuantile<double>(0.25, statIdx, emptyValue: double.NaN).ToString(),
    180         data.GetQuantile<double>(0.75, statIdx, emptyValue: double.NaN).ToString(),
    181         data.GetMode<double>(statIdx, emptyValue: double.NaN).ToString(),
    182         data.GetDistinctValues<double>(statIdx).ToString()
     168    private IList GetDoubleColumns(int statIdx) {
     169      var column = (DoublePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
     170      return new List<object> {
     171        column.GetValueType().Name,
     172        column.GetNumberOfMissingValues(),
     173        column.GetMin(),
     174        column.GetMax(),
     175        column.GetMedian(),
     176        column.GetMean(),
     177        column.GetStandardDeviation(),
     178        column.GetVariance(),
     179        column.GetQuantile(0.25),
     180        column.GetQuantile(0.75),
     181        column.GetMode(),
     182        column.GetDistinctValues()
    183183      };
    184184    }
    185185
    186     private List<string> GetStringColumns(int statIdx) {
    187       var data = Content.PreprocessingData;
    188       return new List<string> {
    189         data.GetVariableType(statIdx).Name,
    190         data.GetMissingValueCount(statIdx).ToString(),
    191         "", // data.GetMin<string>(statIdx, emptyValue: string.Empty), //min
    192         "", // data.GetMax<string>(statIdx, emptyValue: string.Empty), //max
    193         "", // data.GetMedian<string>(statIdx, emptyValue: string.Empty), //median
     186    private IList GetStringColumns(int statIdx) {
     187      var column = (StringPreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
     188      return new List<object> {
     189        column.GetValueType().Name,
     190        column.GetNumberOfMissingValues(),
     191        "", //min
     192        "", //max
     193        "", //median
    194194        "", //average
    195195        "", //standard deviation
    196196        "", //variance
    197         "", // data.GetQuantile<string>(0.25, statIdx, emptyValue: string.Empty), //quarter percentile
    198         "", // data.GetQuantile<string>(0.75, statIdx, emptyValue: string.Empty), //three quarter percentile
    199         data.GetMode<string>(statIdx, emptyValue: string.Empty),
    200         data.GetDistinctValues<string>(statIdx).ToString()
     197        "", //quarter percentile
     198        "", //three quarter percentile
     199        column.GetMode(),
     200        column.GetDistinctValues()
    201201      };
    202202    }
    203203
    204     private List<string> GetDateTimeColumns(int statIdx) {
    205       var data = Content.PreprocessingData;
    206       return new List<string> {
    207         data.GetVariableType(statIdx).Name,
    208         data.GetMissingValueCount(statIdx).ToString(),
    209         data.GetMin<DateTime>(statIdx).ToString(),
    210         data.GetMax<DateTime>(statIdx).ToString(),
    211         data.GetMedian<DateTime>(statIdx).ToString(),
    212         data.GetMean<DateTime>(statIdx).ToString(),
    213         "", // should be of type TimeSpan //data.GetStandardDeviation<DateTime>(statIdx).ToString(),
    214         "", // should be of type TimeSpan //data.GetVariance<DateTime>(statIdx).ToString(),
    215         data.GetQuantile<DateTime>(0.25, statIdx).ToString(),
    216         data.GetQuantile<DateTime>(0.75, statIdx).ToString(),
    217         data.GetMode<DateTime>(statIdx).ToString(),
    218         data.GetDistinctValues<DateTime>(statIdx).ToString()
     204    private IList GetDateTimeColumns(int statIdx) {
     205      var column = (DateTimePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
     206      return new List<object> {
     207        column.GetValueType().Name,
     208        column.GetNumberOfMissingValues(),
     209        column.GetMin(),
     210        column.GetMax(),
     211        column.GetMedian(),
     212        column.GetMean(),
     213        column.GetStandardDeviation(), 
     214        /*column.GetVariance()*/"", // variance (in ticks) is usually to high to display a valid TimeSpan or DateTime
     215        column.GetQuantile(0.25),
     216        column.GetQuantile(0.75),
     217        column.GetMode(),
     218        column.GetDistinctValues()
    219219      };
    220220    }
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/DataGridContent.cs

    r15285 r15309  
    9090
    9191    public void DeleteRows(IEnumerable<int> rows) {
    92       PreprocessingData.DeleteRowsWithIndices(rows);
     92      PreprocessingData.DeleteRows(rows);
    9393    }
    9494
     
    134134
    135135    #region Manipulations
    136     private void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, Func<int, double> doubleAggregator = null,
    137       Func<int, DateTime> dateTimeAggregator = null, Func<int, string> stringAggregator = null) {
    138       PreprocessingData.InTransaction(() => {
    139         foreach (var column in cells) {
    140           if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {
    141             var value = doubleAggregator(column.Key);
    142             foreach (int index in column.Value)
    143               PreprocessingData.SetCell<double>(column.Key, index, value);
    144           } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) {
    145             var value = dateTimeAggregator(column.Key);
    146             foreach (int index in column.Value)
    147               PreprocessingData.SetCell<DateTime>(column.Key, index, value);
    148           } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {
    149             var value = stringAggregator(column.Key);
    150             foreach (int index in column.Value)
    151               PreprocessingData.SetCell<string>(column.Key, index, value);
    152           }
    153         }
    154       });
    155     }
    156 
    157     private void ReplaceIndicesByValues(IDictionary<int, IList<int>> cells, Func<int, IEnumerable<double>> doubleAggregator = null,
    158       Func<int, IEnumerable<DateTime>> dateTimeAggregator = null, Func<int, IEnumerable<string>> stringAggregator = null) {
    159       PreprocessingData.InTransaction(() => {
    160         foreach (var column in cells) {
    161           if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {
    162             var values = doubleAggregator(column.Key);
    163             foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
    164               PreprocessingData.SetCell<double>(column.Key, pair.row, pair.value);
    165           } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) {
    166             var values = dateTimeAggregator(column.Key);
    167             foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
    168               PreprocessingData.SetCell<DateTime>(column.Key, pair.row, pair.value);
    169           } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {
    170             var values = stringAggregator(column.Key);
    171             foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
    172               PreprocessingData.SetCell<string>(column.Key, pair.row, pair.value);
    173           }
    174         }
    175       });
    176     }
    177 
    178136    public void ReplaceIndicesByMean(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    179       ReplaceIndicesByValue(cells,
    180         col => PreprocessingData.GetMean<double>(col, considerSelection),
    181         col => PreprocessingData.GetMean<DateTime>(col, considerSelection));
     137      PreprocessingData.InTransaction(() => {
     138        foreach (var column in cells) {
     139          PreprocessingData.DataColumns[column.Key].TypeSwitch(
     140            c => {
     141              var mean = c.GetMean(considerSelection ? column.Value : null);
     142              foreach (var index in column.Value) c[index] = mean;
     143            },
     144            dateTimeAction: c => {
     145              var mean = c.GetMean(considerSelection ? column.Value : null);
     146              foreach (var index in column.Value) c[index] = mean;
     147            });
     148        }
     149      });
    182150    }
    183151
    184152    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    185       ReplaceIndicesByValue(cells,
    186         col => PreprocessingData.GetMedian<double>(col, considerSelection),
    187         col => PreprocessingData.GetMedian<DateTime>(col, considerSelection));
     153      PreprocessingData.InTransaction(() => {
     154        foreach (var column in cells) {
     155          PreprocessingData.DataColumns[column.Key].TypeSwitch(
     156            c => {
     157              var median = c.GetMedian(considerSelection ? column.Value : null);
     158              foreach (var index in column.Value) c[index] = median;
     159            },
     160            c => {
     161              var median = c.GetMedian(considerSelection ? column.Value : null);
     162              foreach (var index in column.Value) c[index] = median;
     163            },
     164            c => {
     165              var median = c.GetMedian(considerSelection ? column.Value : null);
     166              foreach (var index in column.Value) c[index] = median;
     167            });
     168        }
     169      });
    188170    }
    189171
    190172    public void ReplaceIndicesByMode(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    191       ReplaceIndicesByValue(cells,
    192         col => PreprocessingData.GetMode<double>(col, considerSelection),
    193         col => PreprocessingData.GetMode<DateTime>(col, considerSelection),
    194         col => PreprocessingData.GetMode<string>(col, considerSelection));
     173      PreprocessingData.InTransaction(() => {
     174        foreach (var column in cells) {
     175          PreprocessingData.DataColumns[column.Key].TypeSwitch(
     176            c => {
     177              var mode = c.GetMode(considerSelection ? column.Value : null);
     178              foreach (var index in column.Value) c[index] = mode;
     179            },
     180            c => {
     181              var mode = c.GetMode(considerSelection ? column.Value : null);
     182              foreach (var index in column.Value) c[index] = mode;
     183            },
     184            c => {
     185              var mode = c.GetMode(considerSelection ? column.Value : null);
     186              foreach (var index in column.Value) c[index] = mode;
     187            });
     188        }
     189      });
    195190    }
    196191
    197192    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    198193      var rand = new FastRandom();
    199       ReplaceIndicesByValues(cells,
    200         col => {
    201           double min = PreprocessingData.GetMin<double>(col, considerSelection);
    202           double max = PreprocessingData.GetMax<double>(col, considerSelection);
    203           double range = max - min;
    204           return cells[col].Select(_ => rand.NextDouble() * range + min);
    205         },
    206         col => {
    207           var min = PreprocessingData.GetMin<DateTime>(col, considerSelection);
    208           var max = PreprocessingData.GetMax<DateTime>(col, considerSelection);
    209           double range = (max - min).TotalSeconds;
    210           return cells[col].Select(_ => min + TimeSpan.FromSeconds(rand.NextDouble() * range));
    211         });
     194      PreprocessingData.InTransaction(() => {
     195        foreach (var column in cells) {
     196          PreprocessingData.DataColumns[column.Key].TypeSwitch(
     197            c => {
     198              double min = c.GetMin(considerSelection ? column.Value : null);
     199              double max = c.GetMax(considerSelection ? column.Value : null);
     200              double range = max - min;
     201              foreach (var index in column.Value) c[index] = min + rand.NextDouble() * range;
     202            },
     203            dateTimeAction: c => {
     204              var min = c.GetMin(considerSelection ? column.Value : null);
     205              var max = c.GetMax(considerSelection ? column.Value : null);
     206              double range = (max - min).TotalSeconds;
     207              foreach (var index in column.Value) c[index] = min + TimeSpan.FromSeconds(rand.NextDouble() * range);
     208            });
     209        }
     210      });
    212211    }
    213212
     
    216215        foreach (var column in cells) {
    217216          foreach (var rowIdx in column.Value) {
    218             PreprocessingData.SetValue(value, column.Key, rowIdx);
     217            PreprocessingData.DataColumns[column.Key].SetValue(value, rowIdx);
    219218          }
    220219        }
     
    257256      int valuesToInterpolate = nextIndex - prevIndex;
    258257
    259       if (PreprocessingData.VariableHasType<double>(column.Key)) {
    260         double prev = PreprocessingData.GetCell<double>(column.Key, prevIndex);
    261         double next = PreprocessingData.GetCell<double>(column.Key, nextIndex);
    262         double interpolationStep = (next - prev) / valuesToInterpolate;
    263 
    264         for (int i = prevIndex; i < nextIndex; ++i) {
    265           double interpolated = prev + (interpolationStep * (i - prevIndex));
    266           PreprocessingData.SetCell<double>(column.Key, i, interpolated);
    267         }
    268       } else if (PreprocessingData.VariableHasType<DateTime>(column.Key)) {
    269         DateTime prev = PreprocessingData.GetCell<DateTime>(column.Key, prevIndex);
    270         DateTime next = PreprocessingData.GetCell<DateTime>(column.Key, nextIndex);
    271         double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
    272 
    273         for (int i = prevIndex; i < nextIndex; ++i) {
    274           DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
    275           PreprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
    276         }
    277       }
     258      PreprocessingData.DataColumns[column.Key].TypeSwitch(
     259        c => {
     260          double prev = c[prevIndex];
     261          double next = c[nextIndex];
     262          double interpolationStep = (next - prev) / valuesToInterpolate;
     263          for (int i = prevIndex; i < nextIndex; i++) c[i] = prev + (interpolationStep * (i - prevIndex));
     264        },
     265        dateTimeAction: c => {
     266          var prev = c[prevIndex];
     267          var next = c[nextIndex];
     268          double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
     269          for (int i = prevIndex; i < nextIndex; i++) c[i] = prev.AddSeconds(interpolationStep * (i - prevIndex));
     270        }
     271      );
    278272    }
    279273
    280274    private int IndexOfPrevPresentValue(int columnIndex, int start) {
    281       int offset = start - 1;
    282       while (offset >= 0 && PreprocessingData.IsCellEmpty(columnIndex, offset)) {
    283         offset--;
    284       }
    285 
    286       return offset;
     275      int index = start - 1;
     276      while (index >= 0 && PreprocessingData.IsCellEmpty(columnIndex, index))
     277        index--;
     278      return index;
    287279    }
    288280
    289281    private int IndexOfNextPresentValue(int columnIndex, int start) {
    290       int offset = start + 1;
    291       while (offset < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, offset)) {
    292         offset++;
    293       }
    294 
    295       return offset;
     282      int index = start + 1;
     283      while (index < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, index))
     284        index++;
     285      return index;
    296286    }
    297287
     
    303293        PreprocessingData.InTransaction(() => {
    304294          // process all given ranges - e.g. TrainingPartition, TestPartition
    305           foreach (IntRange range in ranges) {
     295          foreach (var range in ranges) {
    306296            var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray();
    307297            var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray();
     
    324314    public void ReOrderToIndices(int[] indices) {
    325315      PreprocessingData.InTransaction(() => {
    326         for (int i = 0; i < PreprocessingData.Columns; ++i) {
    327           if (PreprocessingData.VariableHasType<double>(i))
    328             ReOrderToIndices<double>(i, indices);
    329           else if (PreprocessingData.VariableHasType<string>(i))
    330             ReOrderToIndices<string>(i, indices);
    331           else if (PreprocessingData.VariableHasType<DateTime>(i))
    332             ReOrderToIndices<DateTime>(i, indices);
    333         }
    334       });
    335     }
    336 
    337     private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
    338       var originalData = new List<T>(PreprocessingData.GetValues<T>(columnIndex));
    339       if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
    340 
    341       for (int i = 0; i < indices.Length; i++) {
    342         T newValue = originalData[indices[i]];
    343         PreprocessingData.SetCell<T>(columnIndex, i, newValue);
    344       }
     316        foreach (var column in PreprocessingData.DataColumns) {
     317          column.TypeSwitch(
     318            c => {
     319              if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
     320              var originalData = new List<double>(c.Values);
     321              for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
     322            },
     323            c => {
     324              if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
     325              var originalData = new List<string>(c.Values);
     326              for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
     327            },
     328            c => {
     329              if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
     330              var originalData = new List<DateTime>(c.Values);
     331              for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
     332            });
     333        }
     334      });
    345335    }
    346336    #endregion
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ManipulationContent.cs

    r15285 r15309  
    5757
    5858      for (int i = 0; i < PreprocessingData.Rows; ++i) {
    59         int missingCount = PreprocessingData.GetRowMissingValueCount(i);
    60         if (100f / PreprocessingData.Columns * missingCount > percent) {
     59        int missingCount = 0;
     60        for (var col = 0; col < PreprocessingData.DataColumns.Count; col++) {
     61          if (!PreprocessingData.DataColumns[col].IsValidValue(i))
     62            missingCount++;
     63        }
     64        if (100f / PreprocessingData.Columns * missingCount > percent)
    6165          rows.Add(i);
    62         }
    6366      }
    6467
     
    6972      List<int> columns = new List<int>();
    7073      for (int i = 0; i < PreprocessingData.Columns; ++i) {
    71         int missingCount = PreprocessingData.GetMissingValueCount(i);
     74        int missingCount = PreprocessingData.DataColumns[i].GetNumberOfMissingValues();
    7275        if (100f / PreprocessingData.Rows * missingCount > percent) {
    7376          columns.Add(i);
     
    8083    public List<int> ColumnsWithVarianceSmaller(double variance) {
    8184      List<int> columns = new List<int>();
    82       for (int i = 0; i < PreprocessingData.Columns; ++i) {
    83         if (PreprocessingData.VariableHasType<double>(i)) {
    84           double columnVariance = PreprocessingData.GetVariance<double>(i);
    85           if (columnVariance < variance) {
    86             columns.Add(i);
    87           }
    88         } else if (PreprocessingData.VariableHasType<DateTime>(i)) {
    89           double columnVariance = (double)PreprocessingData.GetVariance<DateTime>(i).Ticks / TimeSpan.TicksPerSecond;
    90           if (columnVariance < variance) {
    91             columns.Add(i);
    92           }
    93         }
     85
     86      for (int i = 0; i < PreprocessingData.Columns; i++) {
     87        if (PreprocessingData.DataColumns[i].TypeSwitch<bool>(
     88          c => c.GetVariance() < variance,
     89          c => false,
     90          c => c.GetVariance().Ticks / TimeSpan.TicksPerSecond < variance
     91        ))
     92          columns.Add(i);
    9493      }
     94
    9595      return columns;
    9696    }
     
    119119      PreprocessingData.InTransaction(() => {
    120120        foreach (int column in columns.OrderByDescending(x => x)) {
    121           PreprocessingData.DeleteColumn(column);
     121          PreprocessingData.DataColumns.RemoveAt(column);
    122122        }
    123123      });
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/PreprocessingChartContent.cs

    r15274 r15309  
    8282
    8383    public static DataRow CreateDataRow(IFilteredPreprocessingData preprocessingData, string variableName, DataRowVisualProperties.DataRowChartType chartType) {
    84       IList<double> values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));
     84      var values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));
    8585      DataRow row = new DataRow(variableName, "", values);
    8686      row.VisualProperties.ChartType = chartType;
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ScatterPlotContent.cs

    r15274 r15309  
    2121
    2222using System;
    23 using System.Collections.Generic;
    2423using System.Linq;
    2524using HeuristicLab.Analysis;
     
    5150    #endregion
    5251
    53     public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) {
     52    public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY,
     53      string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) {
    5454      ScatterPlot scatterPlot = new ScatterPlot();
    5555
    56       IList<double> xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));
    57       IList<double> yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));
     56      var xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));
     57      var yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));
    5858
    5959      var points = xValues.Zip(yValues, (x, y) => new Point2D<double>(x, y)).ToList();
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/DateTimePreprocessingDataColumn.cs

    r15291 r15309  
    2929namespace HeuristicLab.DataPreprocessing {
    3030  [Item("DateTimePreprocessingDataColumn", "")]
    31   public class DateTimePreprocessingDataColumn : NullablePreprocessingDataColumn<DateTime, TimeSpan> {
     31  public class DateTimePreprocessingDataColumn : PreprocessingDataColumn<DateTime> {
    3232
    3333    #region Constructor, Cloning & Persistence
     
    3535      : base() { }
    3636    public DateTimePreprocessingDataColumn(string name, IEnumerable<DateTime> values)
    37       : base(name, values) {
    38     }
    39     public DateTimePreprocessingDataColumn(string name, IEnumerable<DateTime?> values)
    4037      : base(name, values) {
    4138    }
     
    5350    #endregion
    5451
     52    protected override DateTime DefaultValue { get { return DateTime.MinValue; } }
     53
    5554    #region Statistics
    56     public override TimeSpan GetRange() { return ValidValues.Max() - ValidValues.Min(); }
    57     public override DateTime GetMean() { return AggregateAsDouble(ValidValues, Enumerable.Average); }
    58     public override TimeSpan GetStandardDeviation() { return AggregateDistanceAsDouble(ValidValues, EnumerableStatisticExtensions.StandardDeviation); }
    59     public override TimeSpan GetVariance() { return AggregateDistanceAsDouble(ValidValues, EnumerableStatisticExtensions.Variance); }
     55    public TimeSpan GetRange(IEnumerable<int> indices = null) { return GetMax(indices) - GetMin(indices); }
     56    public DateTime GetMean(IEnumerable<int> indices = null) { return AggregateAsDouble(GetValidValues(indices), Enumerable.Average); }
     57    public TimeSpan GetStandardDeviation(IEnumerable<int> indices = null) { return AggregateDistanceAsDouble(GetValidValues(indices), EnumerableStatisticExtensions.StandardDeviation); }
     58    public TimeSpan GetVariance(IEnumerable<int> indices = null) { return AggregateDistanceAsDouble(GetValidValues(indices), EnumerableStatisticExtensions.Variance); }
    6059    #endregion
    6160
     
    6968    public override string GetValue(int index) {
    7069      var value = Values[index];
    71       return value.HasValue ? value.Value.ToString("o") : string.Empty;
     70      return IsValidValue(value) ? Values[index].ToString("o") : string.Empty;   // format "s" sortable or "o" roundtrip
    7271    }
    7372    public override bool SetValue(string value, int index) {
     
    7776        return true;
    7877      } else if (string.IsNullOrEmpty(value)) {
    79         Values[index] = null;
     78        Values[index] = DateTime.MinValue;
    8079        return true;
    8180      } else {
     
    8685
    8786    private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
    88       return new DateTime((long)func(values.Select(x => (double)x.Ticks)));
     87      return values.Any() ? new DateTime((long)func(values.Select(x => (double)x.Ticks))) : DateTime.MinValue;
    8988    }
    9089    private static TimeSpan AggregateDistanceAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
    91       return new TimeSpan((long)func(values.Select(x => (double)x.Ticks)));
     90      return values.Any() ? new TimeSpan((long)func(values.Select(x => (double)x.Ticks))) : TimeSpan.Zero;
    9291    }
    9392  }
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/DoublePreprocessingDataColumn.cs

    r15291 r15309  
    3131  [Item("DoublePreprocessingDataColumn", "")]
    3232  [StorableClass]
    33   public sealed class DoublePreprocessingDataColumn : NullablePreprocessingDataColumn<double, double> {
     33  public sealed class DoublePreprocessingDataColumn : PreprocessingDataColumn<double> {
    3434
    3535    #region Constructor, Cloning & Persistence
     36
    3637    public DoublePreprocessingDataColumn()
    3738      : base() { }
     39
    3840    public DoublePreprocessingDataColumn(string name, IEnumerable<double> values)
    39       : base(name, values) {
    40     }
    41     public DoublePreprocessingDataColumn(string name, IEnumerable<double?> values)
    42       : base(name, values) {
    43     }
     41      : base(name, values) { }
    4442
    4543    private DoublePreprocessingDataColumn(DoublePreprocessingDataColumn original, Cloner cloner)
    46       : base(original, cloner) {
    47     }
     44      : base(original, cloner) { }
     45
    4846    public override IDeepCloneable Clone(Cloner cloner) {
    4947      return new DoublePreprocessingDataColumn(this, cloner);
     
    5351    private DoublePreprocessingDataColumn(bool deserializing)
    5452      : base(deserializing) { }
     53
    5554    #endregion
    5655
     
    5958    }
    6059
     60    protected override double DefaultValue { get { return double.NaN; } }
     61
    6162    #region Statistics
    62     public override double GetRange() { return ValidValues.Max() - ValidValues.Min(); }
    63     public override double GetMean() { return ValidValues.Average(); }
    64     public override double GetMedian() { return ValidValues.Quantile(0.5); } // IEnumerable<doube> version is faster
    65     public override double GetStandardDeviation() { return ValidValues.StandardDeviation(); }
    66     public override double GetVariance() { return ValidValues.Variance(); }
    67     public override double GetQuantile(double alpha) { return ValidValues.Quantile(alpha); } // IEnumerable<doube> version is faster
     63    public double GetRange(IEnumerable<int> indices = null) { return GetMax(indices) - GetMin(indices); }
     64    public double GetMean(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Average(); }
     65    public override double GetMedian(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Median(); } // IEnumerable<doube> version is faster
     66    public double GetStandardDeviation(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).StandardDeviation(); }
     67    public double GetVariance(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Variance(); }
     68    public override double GetQuantile(double alpha, IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(double.NaN).Quantile(alpha); } // IEnumerable<doube> version is faster
    6869    #endregion
    6970
     
    8384    }
    8485    public override string GetValue(int index) {
    85       var value = Values[index];
    86       return value.HasValue ? value.Value.ToString("r") : string.Empty;
     86      return Values[index].ToString("r");
    8787    }
    8888    public override bool SetValue(string value, int index) {
     
    9292        return true;
    9393      } else if (string.IsNullOrEmpty(value)) {
    94         Values[index] = null;
     94        Values[index] = double.NaN;
    9595        return true;
    9696      } else {
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/PreprocessingDataColumn.cs

    r15291 r15309  
    5454    public abstract bool IsValidValue(int index);
    5555
     56    #region Column Type Switches
     57    internal void TypeSwitch(Action<DoublePreprocessingDataColumn> doubleAction, Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {
     58      var doubleColumn = this as DoublePreprocessingDataColumn;
     59      if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn);
     60      var stringColumn = this as StringPreprocessingDataColumn;
     61      if (stringColumn != null && stringAction != null) stringAction(stringColumn);
     62      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     63      if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn);
     64    }
     65    internal void TypeSwitch<TIn>(TIn value, Action<DoublePreprocessingDataColumn, double> doubleAction, Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime> dateTimeAction = null) {
     66      var doubleColumn = this as DoublePreprocessingDataColumn;
     67      if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn, Convert<double>(value));
     68      var stringColumn = this as StringPreprocessingDataColumn;
     69      if (stringColumn != null && stringAction != null) stringAction(stringColumn, Convert<string>(value));
     70      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     71      if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn, Convert<DateTime>(value));
     72    }
     73
     74    internal TOut TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, double> doubleFunc, Func<StringPreprocessingDataColumn, string> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime> dateTimeFunc = null) {
     75      var doubleColumn = this as DoublePreprocessingDataColumn;
     76      if (doubleColumn != null && doubleFunc != null) return Convert<TOut>(doubleFunc(doubleColumn));
     77      var stringColumn = this as StringPreprocessingDataColumn;
     78      if (stringColumn != null && stringFunc != null) return Convert<TOut>(stringFunc(stringColumn));
     79      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     80      if (dateTimeColumn != null && dateTimeFunc != null) return Convert<TOut>(dateTimeFunc(dateTimeColumn));
     81      throw new InvalidOperationException("Invalid data column type.");
     82    }
     83    internal TOut TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, TOut> doubleFunc, Func<StringPreprocessingDataColumn, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, TOut> dateTimeFunc = null) {
     84      var doubleColumn = this as DoublePreprocessingDataColumn;
     85      if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn);
     86      var stringColumn = this as StringPreprocessingDataColumn;
     87      if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn);
     88      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     89      if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn);
     90      throw new InvalidOperationException("Invalid data column type.");
     91    }
     92    internal TOut TypeSwitch<TIn, TOut>(TIn value, Func<DoublePreprocessingDataColumn, double, TOut> doubleFunc, Func<StringPreprocessingDataColumn, string, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime, TOut> dateTimeFunc = null) {
     93      var doubleColumn = this as DoublePreprocessingDataColumn;
     94      if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn, Convert<double>(value));
     95      var stringColumn = this as StringPreprocessingDataColumn;
     96      if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn, Convert<string>(value));
     97      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     98      if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn, Convert<DateTime>(value));
     99      throw new InvalidOperationException("Invalid data column type.");
     100    }
     101    internal IEnumerable<TOut> TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, IEnumerable<double>> doubleFunc, Func<StringPreprocessingDataColumn, IEnumerable<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, IEnumerable<DateTime>> dateTimeFunc = null) {
     102      var doubleColumn = this as DoublePreprocessingDataColumn;
     103      if (doubleColumn != null && doubleFunc != null) return Convert<IEnumerable<TOut>>(doubleFunc(doubleColumn));
     104      var stringColumn = this as StringPreprocessingDataColumn;
     105      if (stringColumn != null && stringFunc != null) return Convert<IEnumerable<TOut>>(stringFunc(stringColumn));
     106      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     107      if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IEnumerable<TOut>>(dateTimeFunc(dateTimeColumn));
     108      throw new InvalidOperationException("Invalid data column type.");
     109    }
     110    internal IEnumerable<TOut> TypeSwitch<TOut, TIn>(TIn value, Func<DoublePreprocessingDataColumn, double, IEnumerable<double>> doubleFunc, Func<StringPreprocessingDataColumn, string, IEnumerable<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime, IEnumerable<DateTime>> dateTimeFunc = null) {
     111      var doubleColumn = this as DoublePreprocessingDataColumn;
     112      if (doubleColumn != null && doubleFunc != null) return Convert<IEnumerable<TOut>>(doubleFunc(doubleColumn, Convert<double>(value)));
     113      var stringColumn = this as StringPreprocessingDataColumn;
     114      if (stringColumn != null && stringFunc != null) return Convert<IEnumerable<TOut>>(stringFunc(stringColumn, Convert<string>(value)));
     115      var dateTimeColumn = this as DateTimePreprocessingDataColumn;
     116      if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IEnumerable<TOut>>(dateTimeFunc(dateTimeColumn, Convert<DateTime>(value)));
     117      throw new InvalidOperationException("Invalid data column type.");
     118    }
     119
     120    private static T Convert<T>(object obj) { return (T)obj; }
     121    #endregion
     122
     123    #region Statistics
     124    public abstract int GetDistinctValues(IEnumerable<int> indices = null);
     125    public abstract int GetNumberOfMissingValues(IEnumerable<int> indices = null);
     126    #endregion
    56127
    57128    #region String Handling
     
    64135  [Item("PreprocessingDataColumn", "")]
    65136  [StorableClass]
    66   public abstract class PreprocessingDataColumn<TValue, TDistance> : PreprocessingDataColumn
    67     where TValue : class, IComparable<TValue> {
     137  public abstract class PreprocessingDataColumn<T> : PreprocessingDataColumn
     138    where T : IComparable<T> {
    68139
    69140    #region Constructor, Cloning & Persistence
    70141    protected PreprocessingDataColumn()
    71       : this(string.Empty, Enumerable.Empty<TValue>()) { }
    72     protected PreprocessingDataColumn(string name, IEnumerable<TValue> values)
     142      : this(string.Empty, Enumerable.Empty<T>()) { }
     143    protected PreprocessingDataColumn(string name, IEnumerable<T> values)
    73144      : base(name) {
    74       Values = new List<TValue>(values);
    75     }
    76 
    77     protected PreprocessingDataColumn(PreprocessingDataColumn<TValue, TDistance> original, Cloner cloner)
     145      Values = new List<T>(values);
     146    }
     147
     148    protected PreprocessingDataColumn(PreprocessingDataColumn<T> original, Cloner cloner)
    78149      : base(original, cloner) {
    79       Values = new List<TValue>(original.Values);
     150      Values = new List<T>(original.Values);
    80151    }
    81152
     
    86157
    87158    [Storable]
    88     public List<TValue> Values { get; private set; }
    89     public IEnumerable<TValue> ValidValues {
    90       get { return Values.Where(IsValidValue); }
    91     }
     159    internal List<T> Values { get; private set; }
     160    public IEnumerable<T> GetValues(IEnumerable<int> indices = null) {
     161      return indices == null
     162        ? Values
     163        : indices.Select(index => Values[index]);
     164    }
     165    public IEnumerable<T> GetValidValues(IEnumerable<int> indices = null) {
     166      return indices == null
     167        ? Values.Where(IsValidValue)
     168        : indices.Select(index => Values[index]).Where(IsValidValue);
     169    }
     170
     171    protected abstract T DefaultValue { get; }
    92172
    93173    public override Type GetValueType() {
    94       return typeof(TValue);
     174      return typeof(T);
    95175    }
    96176
     
    99179    }
    100180
    101     public TValue this[int index] {
     181    public T this[int index] {
    102182      get { return Values[index]; }
    103183      set { Values[index] = value; }
    104184    }
    105185
    106     public virtual bool IsValidValue(TValue value) { return true; }
     186    public virtual bool IsValidValue(T value) { return true; }
    107187    public override bool IsValidValue(int index) {
    108188      return IsValidValue(Values[index]);
     
    110190
    111191    #region Statistics
    112     public virtual TValue GetMin() { return Values.Min(); }
    113     public virtual TValue GetMax() { return Values.Max(); }
    114     public abstract TDistance GetRange();
    115     public abstract TValue GetMean();
    116     public virtual TValue GetMedian() { return Values.Quantile(0.5); }
    117     public virtual TValue GetMode() { return Values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); }
    118     public abstract TDistance GetStandardDeviation();
    119     public abstract TDistance GetVariance();
    120     public virtual TValue GetQuantile(double alpha) { return Values.Quantile(alpha); }
    121     public virtual int GetDistinctValues() { return Values.GroupBy(x => x).Count(); }
    122     public virtual int GetNumberOfMissingValues() { return Values.Count(IsValidValue); }
    123     #endregion
    124   }
    125 
    126   [Item("NullablePreprocessingDataColumn", "")]
    127   [StorableClass]
    128   public abstract class NullablePreprocessingDataColumn<TValue, TDistance> : PreprocessingDataColumn
    129     where TValue : struct, IComparable<TValue> {
    130 
    131     #region Constructor, Cloning & Persistence
    132     protected NullablePreprocessingDataColumn()
    133       : this(string.Empty, Enumerable.Empty<TValue?>()) { }
    134     protected NullablePreprocessingDataColumn(string name, IEnumerable<TValue> values)
    135       : this(name, values.Select(x => (TValue?)x)) { }
    136     protected NullablePreprocessingDataColumn(string name, IEnumerable<TValue?> values)
    137       : base(name) {
    138       Values = new List<TValue?>(values);
    139     }
    140 
    141     protected NullablePreprocessingDataColumn(NullablePreprocessingDataColumn<TValue, TDistance> original, Cloner cloner)
    142       : base(original, cloner) {
    143       Values = new List<TValue?>(original.Values);
    144     }
    145 
    146     [StorableConstructor]
    147     protected NullablePreprocessingDataColumn(bool deserializing)
    148       : base(deserializing) { }
    149     #endregion
    150 
    151     [Storable]
    152     internal List<TValue?> Values { get; private set; }
    153     protected IEnumerable<TValue> ValidValues {
    154       get { return Values.Where(x => x.HasValue && IsValidValue(x.Value)).Select(x => x.Value); }
    155     }
    156 
    157     public override Type GetValueType() {
    158       return typeof(TValue);
    159     }
    160 
    161     public override int Length {
    162       get { return Values.Count; }
    163     }
    164 
    165     public TValue? this[int index] {
    166       get { return Values[index]; }
    167       set { Values[index] = value; }
    168     }
    169 
    170     public virtual bool IsValidValue(TValue value) { return true; }
    171     public override bool IsValidValue(int index) {
    172       var value = Values[index];
    173       return value.HasValue && IsValidValue(value.Value);
    174     }
    175 
    176     #region Statistics
    177     public virtual TValue GetMin() { return ValidValues.Min(); }
    178     public virtual TValue GetMax() { return ValidValues.Max(); }
    179     public abstract TDistance GetRange();
    180     public abstract TValue GetMean();
    181     public virtual TValue GetMedian() { return ValidValues.Quantile(0.5); }
    182     public virtual TValue GetMode() { return ValidValues.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); }
    183     public abstract TDistance GetStandardDeviation();
    184     public abstract TDistance GetVariance();
    185     public virtual TValue GetQuantile(double alpha) { return ValidValues.Quantile(alpha); }
    186     public virtual int GetDistinctValues() { return ValidValues.GroupBy(x => x).Count(); }
    187     public virtual int GetNumberOfMissingValues() { return Values.Count - ValidValues.Count(); }
     192
     193    public virtual T GetMin(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Min(); }
     194    public virtual T GetMax(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Max(); }
     195    public virtual T GetMedian(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Quantile(0.5); }
     196    public virtual T GetMode(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); }
     197    public virtual T GetQuantile(double alpha, IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Quantile(alpha); }
     198    public override int GetDistinctValues(IEnumerable<int> indices = null) { return GetValidValues(indices).GroupBy(x => x).Count(); }
     199    public override int GetNumberOfMissingValues(IEnumerable<int> indices = null) { return GetValues(indices).Count(x => !IsValidValue(x)); }
    188200    #endregion
    189201  }
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/StringPreprocessingDataColumn.cs

    r15291 r15309  
    2828  [Item("StringPreprocessingDataColumn", "")]
    2929  [StorableClass]
    30   public sealed class StringPreprocessingDataColumn : PreprocessingDataColumn<string, string> {
     30  public sealed class StringPreprocessingDataColumn : PreprocessingDataColumn<string> {
    3131
    3232    #region Constructor, Cloning & Persistence
     
    5353    }
    5454
    55     #region Statistics
    56     public override string GetRange() { return string.Empty; }
    57     public override string GetMean() { return string.Empty; }
    58     public override string GetStandardDeviation() { return string.Empty; }
    59     public override string GetVariance() { return string.Empty; }
    60     #endregion
     55    protected override string DefaultValue { get { return string.Empty; } }
    6156
    6257    #region IStringConvertibleColumn
    6358    public override bool Validate(string value, out string errorMessage) {
    64       if (value == null) {
    65         errorMessage = "Invalid Value (string must not be null)";
    66         return false;
    67       } else {
    68         errorMessage = string.Empty;
    69         return true;
    70       }
     59      errorMessage = string.Empty;
     60      return true;
    7161    }
    7262    public override string GetValue(int index) {
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs

    r15283 r15309  
    3838    private IPreprocessingData filteredData;
    3939
     40    public IList<PreprocessingDataColumn> DataColumns {
     41      get { return ActiveData.DataColumns; }
     42    }
     43
    4044    public IPreprocessingData ActiveData {
    4145      get { return IsFiltered ? filteredData : originalData; }
     
    8286    }
    8387
    84     public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
     88    public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {
    8589      return ActiveData.GetValues<T>(columnIndex, considerSelection);
    8690    }
    8791
    88     public void SetValues<T>(int columnIndex, IList<T> values) {
     92    public void SetValues<T>(int columnIndex, IEnumerable<T> values) {
    8993      if (IsFiltered)
    9094        throw new InvalidOperationException("SetValues not possible while data is filtered");
     
    123127    }
    124128
    125     public void DeleteRowsWithIndices(IEnumerable<int> rows) {
     129    public void DeleteRows(IEnumerable<int> rows) {
    126130      if (IsFiltered)
    127131        throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered");
    128132
    129       originalData.DeleteRowsWithIndices(rows);
     133      originalData.DeleteRows(rows);
    130134    }
    131135
     
    273277    public void EndTransaction() {
    274278      originalData.EndTransaction();
    275     }
    276     #endregion
    277 
    278     #region Statistics
    279     public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    280       return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue);
    281     }
    282     public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    283       return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue);
    284     }
    285     public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    286       return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue);
    287     }
    288     public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    289       return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue);
    290     }
    291     public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
    292       return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue);
    293     }
    294     public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    295       return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue);
    296     }
    297     public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    298       return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue);
    299     }
    300     public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    301       return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue);
    302     }
    303     public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
    304       return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection);
    305     }
    306 
    307     public int GetMissingValueCount() {
    308       return ActiveData.GetMissingValueCount();
    309     }
    310     public int GetMissingValueCount(int columnIndex) {
    311       return ActiveData.GetMissingValueCount(columnIndex);
    312     }
    313     public int GetRowMissingValueCount(int rowIndex) {
    314       return ActiveData.GetRowMissingValueCount(rowIndex);
    315279    }
    316280    #endregion
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs

    r15283 r15309  
    2828namespace HeuristicLab.DataPreprocessing {
    2929  public interface IPreprocessingData : INamedItem {
     30
     31    IList<PreprocessingDataColumn> DataColumns { get; }
     32
    3033    #region Cells
    3134    bool IsCellEmpty(int columnIndex, int rowIndex);
     
    3639    string GetCellAsString(int columnIndex, int rowIndex);
    3740
    38     IList<T> GetValues<T>(int columnIndex, bool considerSelection = false);
     41    IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection = false);
    3942
    40     void SetValues<T>(int columnIndex, IList<T> values);
     43    void SetValues<T>(int columnIndex, IEnumerable<T> values);
    4144    bool SetValue(string value, int columnIndex, int rowIndex);
    4245
     
    4851    void InsertRow(int rowIndex);
    4952    void DeleteRow(int rowIndex);
    50     void DeleteRowsWithIndices(IEnumerable<int> rows);
     53    void DeleteRows(IEnumerable<int> rows);
    5154    void InsertColumn<T>(string variableName, int columnIndex);
    5255
     
    106109    void EndTransaction();
    107110    #endregion
    108 
    109     #region Statistics
    110     T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
    111     T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
    112     T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
    113     T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;
    114     T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>;
    115     T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
    116     T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
    117     T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;
    118     int GetDistinctValues<T>(int columnIndex, bool considerSelection = false);
    119 
    120     int GetMissingValueCount();
    121     int GetMissingValueCount(int columnIndex);
    122     int GetRowMissingValueCount(int rowIndex);
    123     #endregion
    124111  }
    125112}
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs

    r15291 r15309  
    3636  public class PreprocessingData : NamedItem, IPreprocessingData {
    3737
    38     [Storable]
    39     protected List<PreprocessingDataColumn> dataColumns;
     38    [Storable] private List<PreprocessingDataColumn> dataColumns;
     39
     40    public IList<PreprocessingDataColumn> DataColumns {
     41      get { return dataColumns; }
     42    }
     43
    4044
    4145    #region Constructor, Cloning & Persistence
     
    99103    }
    100104
    101     private void ColumnTypeSwitchAction<T>(int columnIndex, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,
    102       Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {
    103       ColumnTypeSwitchAction(dataColumns[columnIndex], value, doubleAction, stringAction, dateTimeAction);
    104     }
    105     private void ColumnTypeSwitchAction<T>(PreprocessingDataColumn column, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,
    106       Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {
    107       var doubleColumn = column as DoublePreprocessingDataColumn;
    108       if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn, Convert<double?>(value));
    109       var stringColumn = column as StringPreprocessingDataColumn;
    110       if (stringColumn != null && stringAction != null) stringAction(stringColumn, Convert<string>(value));
    111       var dateTimeColumn = column as DateTimePreprocessingDataColumn;
    112       if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn, Convert<DateTime?>(value));
    113     }
    114 
    115     private void ColumnTypeSwitchAction(int columnIndex, Action<DoublePreprocessingDataColumn> doubleAction,
    116       Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {
    117       ColumnTypeSwitchAction(dataColumns[columnIndex], doubleAction, stringAction, dateTimeAction);
    118     }
    119     private void ColumnTypeSwitchAction(PreprocessingDataColumn column, Action<DoublePreprocessingDataColumn> doubleAction,
    120       Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {
    121       var doubleColumn = column as DoublePreprocessingDataColumn;
    122       if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn);
    123       var stringColumn = column as StringPreprocessingDataColumn;
    124       if (stringColumn != null && stringAction != null) stringAction(stringColumn);
    125       var dateTimeColumn = column as DateTimePreprocessingDataColumn;
    126       if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn);
    127     }
    128 
    129 
    130     private T ColumnTypeSwitchFunc<T>(int columnIndex, Func<DoublePreprocessingDataColumn, double?> doubleFunc,
    131       Func<StringPreprocessingDataColumn, string> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?> dateTimeFunc = null) {
    132       var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;
    133       if (doubleColumn != null && doubleFunc != null) return Convert<T>(doubleFunc(doubleColumn));
    134       var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;
    135       if (stringColumn != null && stringFunc != null) return Convert<T>(stringFunc(stringColumn));
    136       var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;
    137       if (dateTimeColumn != null && dateTimeFunc != null) return Convert<T>(dateTimeFunc(dateTimeColumn));
    138       throw new InvalidOperationException("Invalid data column type.");
    139     }
    140 
    141     private T ColumnTypeSwitchFuncResult<T>(int columnIndex, Func<DoublePreprocessingDataColumn, T> doubleFunc,
    142       Func<StringPreprocessingDataColumn, T> stringFunc = null, Func<DateTimePreprocessingDataColumn, T> dateTimeFunc = null) {
    143       var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;
    144       if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn);
    145       var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;
    146       if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn);
    147       var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;
    148       if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn);
    149       throw new InvalidOperationException("Invalid data column type.");
    150     }
    151     private TOut ColumnTypeSwitchFuncResult<TIn, TOut>(int columnIndex, TIn value, Func<DoublePreprocessingDataColumn, double?, TOut> doubleFunc,
    152      Func<StringPreprocessingDataColumn, string, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?, TOut> dateTimeFunc = null) {
    153       var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;
    154       if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn, Convert<double?>(value));
    155       var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;
    156       if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn, Convert<string>(value));
    157       var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;
    158       if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn, Convert<DateTime?>(value));
    159       throw new InvalidOperationException("Invalid data column type.");
    160     }
    161 
    // Dispatches on the runtime type of the column at columnIndex and returns the list produced by the
    // matching handler, cast to IList<T> through Convert.
    // The cast only succeeds when T is the element type of the list the handler returned (double, string
    // or DateTime); a mismatching T fails at runtime rather than at compile time.
    // Throws InvalidOperationException when no probe matches or the matching handler is null.
    162     private IList<T> ColumnTypeSwitchFuncList<T>(int columnIndex, Func<DoublePreprocessingDataColumn, IList<double>> doubleFunc,
    163       Func<StringPreprocessingDataColumn, IList<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, IList<DateTime>> dateTimeFunc = null) {
    164       var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;
    165       if (doubleColumn != null && doubleFunc != null) return Convert<IList<T>>(doubleFunc(doubleColumn));
    166       var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;
    167       if (stringColumn != null && stringFunc != null) return Convert<IList<T>>(stringFunc(stringColumn));
    168       var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;
    169       if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IList<T>>(dateTimeFunc(dateTimeColumn));
    170       throw new InvalidOperationException("Invalid data column type.");
    171     }
    // Casts obj to T by going through object. This lets the generic helpers in this class cast between a
    // type parameter and a concrete type; the cast is checked at runtime only.
    172     private static T Convert<T>(object obj) { return (T)obj; }
    173 
    174 
    175105    public T GetCell<T>(int columnIndex, int rowIndex) {
    176       return ColumnTypeSwitchFunc<T>(columnIndex,
     106      return dataColumns[columnIndex].TypeSwitch<T>(
    177107        c => c[rowIndex],
    178108        c => c[rowIndex],
     
    188118        InsertColumn<T>(i.ToString(), i);
    189119
    190       ColumnTypeSwitchAction<T>(columnIndex, value,
     120      dataColumns[columnIndex].TypeSwitch<T>(value,
    191121        (c, v) => c[rowIndex] = v,
    192122        (c, v) => c[rowIndex] = v,
     
    201131    }
    202132
    // Returns the values of the column at columnIndex as an IList<T>.
    // considerSelection == true: builds a new list holding GetCell<T> for every row index found in
    //   selection[columnIndex], in the order the selection enumerates them.
    // considerSelection == false: returns the whole column. Null doubles are mapped to double.NaN and null
    //   DateTimes to DateTime.MinValue in a new list; for string columns c.Values is returned as-is
    //   (NOTE(review): presumably the column's own list rather than a copy - confirm before mutating the result).
    203     public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
    204       if (considerSelection) {
    205         var list = new List<T>();
    206         foreach (var rowIdx in selection[columnIndex]) {
    207           list.Add(GetCell<T>(columnIndex, rowIdx));
    208           //list.Add((T)dataColumns[columnIndex][rowIdx]);
    209         }
    210         return list;
    211       } else {
    212         return ColumnTypeSwitchFuncList<T>(columnIndex,
    213           c => c.Values.Select(x => x ?? double.NaN).ToList(),
    214           c => c.Values,
    215           c => c.Values.Select(x => x ?? DateTime.MinValue).ToList());
    216         //(IList<T>)dataColumns[columnIndex];
    217       }
    218     }
    219 
    220     public void SetValues<T>(int columnIndex, IList<T> values) {
     // Returns the values of the column at columnIndex by delegating to the column's own GetValues.
     // The three lambdas are the double, string and DateTime branches of TypeSwitch<T>; each passes
     // selection[columnIndex] when considerSelection is true and null otherwise
     // (NOTE(review): null presumably means "all rows" - confirm against the column's GetValues).
     133    public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {
     134      return dataColumns[columnIndex].TypeSwitch<T>(
     135        c => c.GetValues(considerSelection ? selection[columnIndex] : null),
     136        c => c.GetValues(considerSelection ? selection[columnIndex] : null),
     137        c => c.GetValues(considerSelection ? selection[columnIndex] : null));
     138    }
     139
     140    public void SetValues<T>(int columnIndex, IEnumerable<T> values) {
    221141      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    222142      if (VariableHasType<T>(columnIndex)) {
     
    239159
    240160    public bool SetValue(string value, int columnIndex, int rowIndex) {
    241       bool valid = false;
    242       if (VariableHasType<double>(columnIndex)) {
    243         double val;
    244         if (string.IsNullOrWhiteSpace(value)) {
    245           val = double.NaN;
    246           valid = true;
    247         } else {
    248           valid = double.TryParse(value, out val);
    249         }
    250         if (valid)
    251           SetCell(columnIndex, rowIndex, val);
    252       } else if (VariableHasType<string>(columnIndex)) {
    253         valid = value != null;
    254         if (valid)
    255           SetCell(columnIndex, rowIndex, value);
    256       } else if (VariableHasType<DateTime>(columnIndex)) {
    257         DateTime date;
    258         valid = DateTime.TryParse(value, out date);
    259         if (valid)
    260           SetCell(columnIndex, rowIndex, date);
    261       } else {
    262         throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
    263       }
     161      var column = dataColumns[columnIndex];
     162      bool successful = column.SetValue(value, rowIndex);
    264163
    265164      if (!IsInTransaction)
    266165        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    267166
    268       return valid;
     167      return successful;
    269168    }
    270169
     
    274173
    275174    public int Rows {
    276       get { return dataColumns.Count > 0 ? dataColumns[0].Length : 0; }
     175      get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; }
    277176    }
    278177    #endregion
     
    281180    public void InsertRow(int rowIndex) {
    282181      SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
     182
    283183      foreach (var column in dataColumns) {
    284         ColumnTypeSwitchAction(column,
     184        column.TypeSwitch(
     185          c => c.Values.Insert(rowIndex, double.NaN),
    285186          c => c.Values.Insert(rowIndex, null),
    286           c => c.Values.Insert(rowIndex, null),
    287           c => c.Values.Insert(rowIndex, null));
    288         //var valueType = column.GetValueType();
    289         //column.Insert(rowIndex, valueType.IsValueType ? Activator.CreateInstance(valueType) : null);
    290       }
     187          c => c.Values.Insert(rowIndex, DateTime.MinValue));
     188      }
     189
    291190      if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    292191        TrainingPartition.End++;
     
    302201        }
    303202      }
     203
    304204      if (!IsInTransaction)
    305205        OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
    306206    }
     207
    307208    public void DeleteRow(int rowIndex) {
    308       SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
    309       foreach (var column in dataColumns) {
    310         ColumnTypeSwitchAction(column,
    311           c => c.Values.RemoveAt(rowIndex),
    312           c => c.Values.RemoveAt(rowIndex),
    313           c => c.Values.RemoveAt(rowIndex));
    314         //column.RemoveAt(rowIndex);
    315       }
    316       if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    317         TrainingPartition.End--;
    318         if (TrainingPartition.End <= TestPartition.Start) {
    319           TestPartition.Start--;
    320           TestPartition.End--;
    321         }
    322       } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
    323         TestPartition.End--;
    324         if (TestPartition.End <= TrainingPartition.Start) {
    325           TestPartition.Start--;
    326           TestPartition.End--;
    327         }
    328       }
    329       if (!IsInTransaction)
    330         OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
    331     }
    332     public void DeleteRowsWithIndices(IEnumerable<int> rows) {
     209      DeleteRows(new[] { rowIndex });
     210    }
     211    public void DeleteRows(IEnumerable<int> rowIndices) {
    333212      SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);
    334       foreach (int rowIndex in rows.OrderByDescending(x => x)) {
     213
     214      foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) {
    335215        foreach (var column in dataColumns) {
    336           ColumnTypeSwitchAction(column,
     216          column.TypeSwitch(
    337217            c => c.Values.RemoveAt(rowIndex),
    338218            c => c.Values.RemoveAt(rowIndex),
    339219            c => c.Values.RemoveAt(rowIndex));
    340           //column.RemoveAt(rowIndex);
    341         }
     220        }
     221
    342222        if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    343223          TrainingPartition.End--;
     
    354234        }
    355235      }
     236
    356237      if (!IsInTransaction)
    357238        OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);
     
    362243
    363244      if (typeof(T) == typeof(double)) {
    364         dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double?>(null, Rows)));
     245        dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows)));
    365246      } else if (typeof(T) == typeof(string)) {
    366         dataColumns.Add(new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(null, Rows)));
     247        dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows)));
    367248      } else if (typeof(T) == typeof(DateTime)) {
    368         dataColumns.Add(new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime?>(null, Rows)));
     249        dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows)));
    369250      } else {
    370251        throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
    371252      }
    372253
    373       //dataColumns.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));
    374       //variableNames.Insert(columnIndex, variableName);
    375254      if (!IsInTransaction)
    376255        OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
     
    379258    public void DeleteColumn(int columnIndex) {
    380259      SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
     260
    381261      dataColumns.RemoveAt(columnIndex);
    382       //variableNames.RemoveAt(columnIndex);
     262
    383263      if (!IsInTransaction)
    384264        OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
     
    386266
    387267    public void RenameColumn(int columnIndex, string name) {
    388       SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    389268      if (columnIndex < 0 || columnIndex > dataColumns.Count)
    390269        throw new ArgumentOutOfRangeException("columnIndex");
     270
     271      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
     272
    391273      dataColumns[columnIndex].Name = name;
    392274
     
    400282
    401283      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
     284
    402285      for (int i = 0; i < names.Count; i++)
    403286        dataColumns[i].Name = names[i];
     
    408291
    409292    public bool AreAllStringColumns(IEnumerable<int> columnIndices) {
    410       return columnIndices.All(x => VariableHasType<string>(x));
     293      return columnIndices.All(VariableHasType<string>);
    411294    }
    412295    #endregion
     
    522405        var stringColumn = dataColumns[i] as StringPreprocessingDataColumn;
    523406        var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn;
    524         if (doubleColumn != null) values.Add(new List<double>(doubleColumn.Values.Select(x => x ?? double.NaN)));
    525         else if (stringColumn != null) values.Add(new List<string>(stringColumn.Values));
    526         else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.Values.Select(x => x ?? DateTime.MinValue)));
     407        if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues()));
     408        else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues()));
     409        else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues()));
    527410        else throw new InvalidOperationException("Column type not supported for export");
    528411      }
     
    638521    #endregion
    639522
    640     #region Statistics
    // Returns the smallest non-missing value of the column, or emptyValue when the column (or the
    // selection, if considerSelection is true) contains no valid value.
    641     public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    642       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    643       return values.Any() ? values.Min() : emptyValue;
    644     }
    645 
    // Returns the largest non-missing value of the column, or emptyValue when the column (or the
    // selection, if considerSelection is true) contains no valid value.
    646     public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    647       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    648       return values.Any() ? values.Max() : emptyValue;
    649     }
    650 
    // Returns the mean of the non-missing values of the column, selected by T:
    //   double   - arithmetic average, or emptyValue when there is no valid value
    //   string   - always string.Empty (emptyValue is ignored)
    //   DateTime - average computed on the values converted to seconds (see AggregateAsDouble), or emptyValue
    // Throws InvalidOperationException for any other T.
    651     public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    652       if (typeof(T) == typeof(double)) {
    653         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    654         return values.Any() ? Convert<T>(values.Average()) : emptyValue;
    655       }
    656       if (typeof(T) == typeof(string)) {
    657         return Convert<T>(string.Empty);
    658       }
    659       if (typeof(T) == typeof(DateTime)) {
    660         var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    661         return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
    662       }
    663 
    664       throw new InvalidOperationException(typeof(T) + " not supported");
    665     }
    666 
    // Returns the median of the non-missing values of the column, or emptyValue when there is none.
    // double columns go through the Median() extension on IEnumerable<double>; every other T uses the
    // generic Quantile(0.5) extension (both extensions are defined outside this view).
    667     public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    668       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
    669         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    670         return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
    671       }
    672       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    673       return values.Any() ? values.Quantile(0.5) : emptyValue;
    674     }
    675 
    // Returns the most frequent non-missing value of the column, or emptyValue when there is none.
    // Values are grouped and the groups ordered by descending size; because GroupBy keeps first-occurrence
    // order and OrderByDescending is a stable sort, a tie is won by the value that appears first.
    676     public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
    677       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    678       return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
    679     }
    680 
    // Returns the standard deviation of the non-missing values of a double column, or emptyValue when
    // there is no valid value.
    // NOTE(review): for every T other than double this returns default(T), not emptyValue; the DateTime
    // branch below is deliberately disabled.
    681     public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    682       if (typeof(T) == typeof(double)) {
    683         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    684         return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
    685       }
    686       // For DateTime, std.dev / variance would have to be TimeSpan
    687       //if (typeof(T) == typeof(DateTime)) {
    688       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    689       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
    690       //}
    691       return default(T);
    692     }
    693 
    // Returns the variance of the non-missing values of a double column, or emptyValue when there is
    // no valid value.
    // NOTE(review): for every T other than double this returns default(T), not emptyValue; the DateTime
    // branch below is deliberately disabled.
    694     public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    695       if (typeof(T) == typeof(double)) {
    696         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    697         return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
    698       }
    699       // DateTime variance often overflows long, thus the corresponding DateTime is invalid
    700       //if (typeof(T) == typeof(DateTime)) {
    701       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    702       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
    703       //}
    704       return default(T);
    705     }
    706 
    // Returns the alpha-quantile of the non-missing values of the column, or emptyValue when there is none.
    // double columns use the Quantile extension on IEnumerable<double>; every other T uses the generic
    // Quantile extension (both are defined outside this view).
    707     public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    708       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
    709         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    710         return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
    711       }
    712       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    713       return values.Any() ? values.Quantile(alpha) : emptyValue;
    714     }
    715 
    // Returns how many distinct non-missing values the column contains (the number of groups formed
    // when grouping the values by themselves); 0 when there is no valid value.
    716     public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
    717       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    718       return values.GroupBy(x => x).Count();
    719     }
    720 
    // Returns the values of the column (see GetValues) with missing/invalid entries filtered out.
    // Each value is checked with the column's IsValidValue; for double and DateTime columns the value is
    // first cast to a nullable and must have a value.
    // The result is a deferred LINQ query: the filter, including the column type dispatch performed by
    // ColumnTypeSwitchFuncResult for every element, is re-run on each enumeration.
    721     private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
    722       //var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;
    723       //var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;
    724       //var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;
    725       //return GetValues<T>(columnIndex, considerSelection).Where(x =>
    726       //  doubleColumn != null ? doubleColumn.IsValidValue(Convert<double>(x))
    727       //  : stringColumn != null ? stringColumn.IsValidValue(Convert<string>(x))
    728       //  : dateTimeColumn != null ? dateTimeColumn.IsValidValue(Convert<DateTime>(x))
    729       //  : false);
    730       //!IsMissingValue(x));
    731 
    732       return GetValues<T>(columnIndex, considerSelection).Where(x =>
    733         ColumnTypeSwitchFuncResult<T, bool>(columnIndex, x,
    734           (c, v) => v.HasValue && c.IsValidValue(v.Value),
    735           (c, v) => c.IsValidValue(v),
    736           (c, v) => v.HasValue && c.IsValidValue(v.Value)
    737       ));
    738     }
    739 
    // Applies a double aggregate (e.g. Enumerable.Average) to DateTime values.
    // Each value is converted to seconds (Ticks / TicksPerSecond) as a double, func is applied, and the
    // result is scaled back to ticks, truncated to long and wrapped in a new DateTime.
    740     private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
    741       return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
    742     }
    743 
    // Returns the total number of missing cells, summed over all columns (0 .. Columns - 1).
    744     public int GetMissingValueCount() {
    745       int count = 0;
    746       for (int i = 0; i < Columns; ++i) {
    747         count += GetMissingValueCount(i);
    748       }
    749       return count;
    750     }
    // Returns the number of rows (0 .. Rows - 1) whose cell in the given column is reported empty by IsCellEmpty.
    751     public int GetMissingValueCount(int columnIndex) {
    752       int sum = 0;
    753       for (int i = 0; i < Rows; i++) {
    754         if (IsCellEmpty(columnIndex, i))
    755           sum++;
    756       }
    757       return sum;
    758     }
    // Returns the number of columns (0 .. Columns - 1) whose cell in the given row is reported empty by IsCellEmpty.
    759     public int GetRowMissingValueCount(int rowIndex) {
    760       int sum = 0;
    761       for (int i = 0; i < Columns; i++) {
    762         if (IsCellEmpty(i, rowIndex))
    763           sum++;
    764       }
    765       return sum;
    766     }
    767     #endregion
    768 
    769     #region Helpers
    // Returns a new outer list in which every inner list is replaced by a freshly created instance of the
    // same runtime type, constructed with the original inner list as the single constructor argument.
    // NOTE(review): this presumably relies on each inner list type having a copy constructor (as List<T>
    // does), giving a per-column shallow copy - confirm for any other IList implementation stored here.
    770     private static IList<IList> CopyVariableValues(IList<IList> original) {
    771       var copy = new List<IList>(original);
    772       for (int i = 0; i < original.Count; ++i) {
    773         copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]);
    774       }
    775       return copy;
    776     }
    777     #endregion
     523    /* #region Statistics
     524     public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     525       try {
     526         return dataColumns[columnIndex].TypeSwitch<T>(
     527           col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
     528           col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
     529           col => col.GetMin(considerSelection ? Selection[columnIndex] : null));
     530       } catch (InvalidOperationException) {
     531         return emptyValue;
     532       }
     533     }
     534
     535     public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     536       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     537       return values.Any() ? values.Max() : emptyValue;
     538     }
     539
     540     public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     541       return
     542
     543
     544       if (typeof(T) == typeof(double)) {
     545         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     546         return values.Any() ? Convert<T>(values.Average()) : emptyValue;
     547       }
     548       if (typeof(T) == typeof(string)) {
     549         return Convert<T>(string.Empty);
     550       }
     551       if (typeof(T) == typeof(DateTime)) {
     552         var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     553         return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
     554       }
     555
     556       throw new InvalidOperationException(typeof(T) + " not supported");
     557     }
     558
     559     public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     560       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
     561         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     562         return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
     563       }
     564       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     565       return values.Any() ? values.Quantile(0.5) : emptyValue;
     566     }
     567
     568     public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
     569       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     570       return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
     571     }
     572
     573     public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     574       if (typeof(T) == typeof(double)) {
     575         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     576         return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
     577       }
     578       // For DateTime, std.dev / variance would have to be TimeSpan
     579       //if (typeof(T) == typeof(DateTime)) {
     580       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     581       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
     582       //}
     583       return default(T);
     584     }
     585
     586     public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     587       if (typeof(T) == typeof(double)) {
     588         var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     589         return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
     590       }
     591       // DateTime variance often overflows long, thus the corresponding DateTime is invalid
     592       //if (typeof(T) == typeof(DateTime)) {
     593       //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     594       //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
     595       //}
     596       return default(T);
     597     }
     598
     599     public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     600       if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
     601         var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     602         return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
     603       }
     604       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     605       return values.Any() ? values.Quantile(alpha) : emptyValue;
     606     }
     607
     608     public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
     609       var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     610       return values.GroupBy(x => x).Count();
     611     }
     612
     613     private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
     614       return GetValues<T>(columnIndex, considerSelection).Where(x =>
     615         ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x,
     616           (c, v) => c.IsValidValue(v),
     617           (c, v) => c.IsValidValue(v),
     618           (c, v) => c.IsValidValue(v)
     619       ));
     620     }
     621
     622     private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
     623       return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
     624     }
     625
     626     public int GetMissingValueCount() {
     627       int count = 0;
     628       for (int i = 0; i < Columns; ++i) {
     629         count += GetMissingValueCount(i);
     630       }
     631       return count;
     632     }
     633     public int GetMissingValueCount(int columnIndex) {
     634       int sum = 0;
     635       for (int i = 0; i < Rows; i++) {
     636         if (IsCellEmpty(columnIndex, i))
     637           sum++;
     638       }
     639       return sum;
     640     }
     641     public int GetRowMissingValueCount(int rowIndex) {
     642       int sum = 0;
     643       for (int i = 0; i < Columns; i++) {
     644         if (IsCellEmpty(i, rowIndex))
     645           sum++;
     646       }
     647       return sum;
     648     }
     649     #endregion  */
    778650  }
    779651
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/PreprocessingTransformator.cs

    r15270 r15309  
    8383          int colIndex = preprocessingData.GetColumnIndex(transformation.Column);
    8484          var originalData = preprocessingData.GetValues<double>(colIndex);
    85           originalColumns.Add(transformation.Column, originalData);
     85          originalColumns.Add(transformation.Column, originalData.ToList());
    8686        }
    8787      }
     
    107107    }
    108108
    109     private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IList<double> data, out bool success, out string errorMsg) {
     109    private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IEnumerable<double> data, out bool success, out string errorMsg) {
    110110      success = transformation.Check(data, out errorMsg);
    111111      // don't apply when the check fails
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/ProblemDataCreator.cs

    r15110 r15309  
    129129    }
    130130
    131     private bool IsNotConstantInputVariable(IList<double> list) {
     131    private bool IsNotConstantInputVariable(IEnumerable<double> list) {
    132132      return context.Data.TrainingPartition.End - context.Data.TrainingPartition.Start > 1 || list.Range() > 0;
    133133    }
Note: See TracChangeset for help on using the changeset viewer.