Changeset 15431


Ignore:
Timestamp:
10/25/17 12:38:12 (2 years ago)
Author:
pfleck
Message:

#2809: Removed experimental static-typed datacolumns. (reverse merge r15291, r15309)

Location:
branches/DataPreprocessing Cleanup
Files:
1 deleted
12 edited

Legend:

Unmodified
Added
Removed
  • branches/DataPreprocessing Cleanup/DataPreprocessing Cleanup.sln

    r15291 r15431  
    22Microsoft Visual Studio Solution File, Format Version 12.00
    33# Visual Studio 15
    4 VisualStudioVersion = 15.0.26430.16
     4VisualStudioVersion = 15.0.26430.15
    55MinimumVisualStudioVersion = 10.0.40219.1
    66Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.DataPreprocessing-3.4", "HeuristicLab.DataPreprocessing\3.4\HeuristicLab.DataPreprocessing-3.4.csproj", "{3B90F866-70F8-43EF-A541-51819D255B7B}"
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing.Views/3.4/StatisticsView.cs

    r15309 r15431  
    9292      numericColumnsTextBox.Text = GetColumnCount<double>().ToString();
    9393      nominalColumnsTextBox5.Text = GetColumnCount<string>().ToString();
    94       missingValuesTextBox.Text = data.DataColumns.Sum(c => c.GetNumberOfMissingValues()).ToString();
    95       totalValuesTextBox.Text = (data.Rows * data.Rows - data.DataColumns.Sum(c => c.GetNumberOfMissingValues())).ToString();
     94      missingValuesTextBox.Text = data.GetMissingValueCount().ToString();
     95      totalValuesTextBox.Text = (data.Rows * data.Rows - data.GetMissingValueCount()).ToString();
    9696
    9797      var variableNames = Content.PreprocessingData.VariableNames.ToList();
     
    111111        for (int j = 0; j < statistics.Count; j++) {
    112112          if (horizontal)
    113             statisticsMatrix[j, i] = statistics[j].ToString();
     113            statisticsMatrix[j, i] = statistics[j];
    114114          else
    115             statisticsMatrix[i, j] = statistics[j].ToString();
     115            statisticsMatrix[i, j] = statistics[j];
    116116        }
    117117      }
     
    148148    }
    149149
    150     private IList GetStatistics(int varIdx) {
    151       IList list;
     150    private List<string> GetStatistics(int varIdx) {
     151      List<string> list;
    152152      var data = Content.PreprocessingData;
    153153      if (data.VariableHasType<double>(varIdx)) {
     
    166166    }
    167167
    168     private IList GetDoubleColumns(int statIdx) {
    169       var column = (DoublePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
    170       return new List<object> {
    171         column.GetValueType().Name,
    172         column.GetNumberOfMissingValues(),
    173         column.GetMin(),
    174         column.GetMax(),
    175         column.GetMedian(),
    176         column.GetMean(),
    177         column.GetStandardDeviation(),
    178         column.GetVariance(),
    179         column.GetQuantile(0.25),
    180         column.GetQuantile(0.75),
    181         column.GetMode(),
    182         column.GetDistinctValues()
     168    private List<string> GetDoubleColumns(int statIdx) {
     169      var data = Content.PreprocessingData;
     170      return new List<string> {
     171        data.GetVariableType(statIdx).Name,
     172        data.GetMissingValueCount(statIdx).ToString(),
     173        data.GetMin<double>(statIdx, emptyValue: double.NaN).ToString(),
     174        data.GetMax<double>(statIdx, emptyValue: double.NaN).ToString(),
     175        data.GetMedian<double>(statIdx, emptyValue: double.NaN).ToString(),
     176        data.GetMean<double>(statIdx, emptyValue: double.NaN).ToString(),
     177        data.GetStandardDeviation<double>(statIdx, emptyValue: double.NaN).ToString(),
     178        data.GetVariance<double>(statIdx, emptyValue: double.NaN).ToString(),
     179        data.GetQuantile<double>(0.25, statIdx, emptyValue: double.NaN).ToString(),
     180        data.GetQuantile<double>(0.75, statIdx, emptyValue: double.NaN).ToString(),
     181        data.GetMode<double>(statIdx, emptyValue: double.NaN).ToString(),
     182        data.GetDistinctValues<double>(statIdx).ToString()
    183183      };
    184184    }
    185185
    186     private IList GetStringColumns(int statIdx) {
    187       var column = (StringPreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
    188       return new List<object> {
    189         column.GetValueType().Name,
    190         column.GetNumberOfMissingValues(),
    191         "", //min
    192         "", //max
    193         "", //median
     186    private List<string> GetStringColumns(int statIdx) {
     187      var data = Content.PreprocessingData;
     188      return new List<string> {
     189        data.GetVariableType(statIdx).Name,
     190        data.GetMissingValueCount(statIdx).ToString(),
     191        "", // data.GetMin<string>(statIdx, emptyValue: string.Empty), //min
     192        "", // data.GetMax<string>(statIdx, emptyValue: string.Empty), //max
     193        "", // data.GetMedian<string>(statIdx, emptyValue: string.Empty), //median
    194194        "", //average
    195195        "", //standard deviation
    196196        "", //variance
    197         "", //quarter percentile
    198         "", //three quarter percentile
    199         column.GetMode(),
    200         column.GetDistinctValues()
     197        "", // data.GetQuantile<string>(0.25, statIdx, emptyValue: string.Empty), //quarter percentile
     198        "", // data.GetQuantile<string>(0.75, statIdx, emptyValue: string.Empty), //three quarter percentile
     199        data.GetMode<string>(statIdx, emptyValue: string.Empty),
     200        data.GetDistinctValues<string>(statIdx).ToString()
    201201      };
    202202    }
    203203
    204     private IList GetDateTimeColumns(int statIdx) {
    205       var column = (DateTimePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];
    206       return new List<object> {
    207         column.GetValueType().Name,
    208         column.GetNumberOfMissingValues(),
    209         column.GetMin(),
    210         column.GetMax(),
    211         column.GetMedian(),
    212         column.GetMean(),
    213         column.GetStandardDeviation(), 
    214         /*column.GetVariance()*/"", // variance (in ticks) is usually to high to display a valid TimeSpan or DateTime
    215         column.GetQuantile(0.25),
    216         column.GetQuantile(0.75),
    217         column.GetMode(),
    218         column.GetDistinctValues()
     204    private List<string> GetDateTimeColumns(int statIdx) {
     205      var data = Content.PreprocessingData;
     206      return new List<string> {
     207        data.GetVariableType(statIdx).Name,
     208        data.GetMissingValueCount(statIdx).ToString(),
     209        data.GetMin<DateTime>(statIdx).ToString(),
     210        data.GetMax<DateTime>(statIdx).ToString(),
     211        data.GetMedian<DateTime>(statIdx).ToString(),
     212        data.GetMean<DateTime>(statIdx).ToString(),
     213        "", // should be of type TimeSpan //data.GetStandardDeviation<DateTime>(statIdx).ToString(),
     214        "", // should be of type TimeSpan //data.GetVariance<DateTime>(statIdx).ToString(),
     215        data.GetQuantile<DateTime>(0.25, statIdx).ToString(),
     216        data.GetQuantile<DateTime>(0.75, statIdx).ToString(),
     217        data.GetMode<DateTime>(statIdx).ToString(),
     218        data.GetDistinctValues<DateTime>(statIdx).ToString()
    219219      };
    220220    }
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/DataGridContent.cs

    r15309 r15431  
    9090
    9191    public void DeleteRows(IEnumerable<int> rows) {
    92       PreprocessingData.DeleteRows(rows);
     92      PreprocessingData.DeleteRowsWithIndices(rows);
    9393    }
    9494
     
    134134
    135135    #region Manipulations
     136    private void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, Func<int, double> doubleAggregator = null,
     137      Func<int, DateTime> dateTimeAggregator = null, Func<int, string> stringAggregator = null) {
     138      PreprocessingData.InTransaction(() => {
     139        foreach (var column in cells) {
     140          if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {
     141            var value = doubleAggregator(column.Key);
     142            foreach (int index in column.Value)
     143              PreprocessingData.SetCell<double>(column.Key, index, value);
     144          } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) {
     145            var value = dateTimeAggregator(column.Key);
     146            foreach (int index in column.Value)
     147              PreprocessingData.SetCell<DateTime>(column.Key, index, value);
     148          } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {
     149            var value = stringAggregator(column.Key);
     150            foreach (int index in column.Value)
     151              PreprocessingData.SetCell<string>(column.Key, index, value);
     152          }
     153        }
     154      });
     155    }
     156
     157    private void ReplaceIndicesByValues(IDictionary<int, IList<int>> cells, Func<int, IEnumerable<double>> doubleAggregator = null,
     158      Func<int, IEnumerable<DateTime>> dateTimeAggregator = null, Func<int, IEnumerable<string>> stringAggregator = null) {
     159      PreprocessingData.InTransaction(() => {
     160        foreach (var column in cells) {
     161          if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {
     162            var values = doubleAggregator(column.Key);
     163            foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
     164              PreprocessingData.SetCell<double>(column.Key, pair.row, pair.value);
     165          } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) {
     166            var values = dateTimeAggregator(column.Key);
     167            foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
     168              PreprocessingData.SetCell<DateTime>(column.Key, pair.row, pair.value);
     169          } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {
     170            var values = stringAggregator(column.Key);
     171            foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))
     172              PreprocessingData.SetCell<string>(column.Key, pair.row, pair.value);
     173          }
     174        }
     175      });
     176    }
     177
    136178    public void ReplaceIndicesByMean(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    137       PreprocessingData.InTransaction(() => {
    138         foreach (var column in cells) {
    139           PreprocessingData.DataColumns[column.Key].TypeSwitch(
    140             c => {
    141               var mean = c.GetMean(considerSelection ? column.Value : null);
    142               foreach (var index in column.Value) c[index] = mean;
    143             },
    144             dateTimeAction: c => {
    145               var mean = c.GetMean(considerSelection ? column.Value : null);
    146               foreach (var index in column.Value) c[index] = mean;
    147             });
    148         }
    149       });
     179      ReplaceIndicesByValue(cells,
     180        col => PreprocessingData.GetMean<double>(col, considerSelection),
     181        col => PreprocessingData.GetMean<DateTime>(col, considerSelection));
    150182    }
    151183
    152184    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    153       PreprocessingData.InTransaction(() => {
    154         foreach (var column in cells) {
    155           PreprocessingData.DataColumns[column.Key].TypeSwitch(
    156             c => {
    157               var median = c.GetMedian(considerSelection ? column.Value : null);
    158               foreach (var index in column.Value) c[index] = median;
    159             },
    160             c => {
    161               var median = c.GetMedian(considerSelection ? column.Value : null);
    162               foreach (var index in column.Value) c[index] = median;
    163             },
    164             c => {
    165               var median = c.GetMedian(considerSelection ? column.Value : null);
    166               foreach (var index in column.Value) c[index] = median;
    167             });
    168         }
    169       });
     185      ReplaceIndicesByValue(cells,
     186        col => PreprocessingData.GetMedian<double>(col, considerSelection),
     187        col => PreprocessingData.GetMedian<DateTime>(col, considerSelection));
    170188    }
    171189
    172190    public void ReplaceIndicesByMode(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    173       PreprocessingData.InTransaction(() => {
    174         foreach (var column in cells) {
    175           PreprocessingData.DataColumns[column.Key].TypeSwitch(
    176             c => {
    177               var mode = c.GetMode(considerSelection ? column.Value : null);
    178               foreach (var index in column.Value) c[index] = mode;
    179             },
    180             c => {
    181               var mode = c.GetMode(considerSelection ? column.Value : null);
    182               foreach (var index in column.Value) c[index] = mode;
    183             },
    184             c => {
    185               var mode = c.GetMode(considerSelection ? column.Value : null);
    186               foreach (var index in column.Value) c[index] = mode;
    187             });
    188         }
    189       });
     191      ReplaceIndicesByValue(cells,
     192        col => PreprocessingData.GetMode<double>(col, considerSelection),
     193        col => PreprocessingData.GetMode<DateTime>(col, considerSelection),
     194        col => PreprocessingData.GetMode<string>(col, considerSelection));
    190195    }
    191196
    192197    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    193198      var rand = new FastRandom();
    194       PreprocessingData.InTransaction(() => {
    195         foreach (var column in cells) {
    196           PreprocessingData.DataColumns[column.Key].TypeSwitch(
    197             c => {
    198               double min = c.GetMin(considerSelection ? column.Value : null);
    199               double max = c.GetMax(considerSelection ? column.Value : null);
    200               double range = max - min;
    201               foreach (var index in column.Value) c[index] = min + rand.NextDouble() * range;
    202             },
    203             dateTimeAction: c => {
    204               var min = c.GetMin(considerSelection ? column.Value : null);
    205               var max = c.GetMax(considerSelection ? column.Value : null);
    206               double range = (max - min).TotalSeconds;
    207               foreach (var index in column.Value) c[index] = min + TimeSpan.FromSeconds(rand.NextDouble() * range);
    208             });
    209         }
    210       });
     199      ReplaceIndicesByValues(cells,
     200        col => {
     201          double min = PreprocessingData.GetMin<double>(col, considerSelection);
     202          double max = PreprocessingData.GetMax<double>(col, considerSelection);
     203          double range = max - min;
     204          return cells[col].Select(_ => rand.NextDouble() * range + min);
     205        },
     206        col => {
     207          var min = PreprocessingData.GetMin<DateTime>(col, considerSelection);
     208          var max = PreprocessingData.GetMax<DateTime>(col, considerSelection);
     209          double range = (max - min).TotalSeconds;
     210          return cells[col].Select(_ => min + TimeSpan.FromSeconds(rand.NextDouble() * range));
     211        });
    211212    }
    212213
     
    215216        foreach (var column in cells) {
    216217          foreach (var rowIdx in column.Value) {
    217             PreprocessingData.DataColumns[column.Key].SetValue(value, rowIdx);
     218            PreprocessingData.SetValue(value, column.Key, rowIdx);
    218219          }
    219220        }
     
    256257      int valuesToInterpolate = nextIndex - prevIndex;
    257258
    258       PreprocessingData.DataColumns[column.Key].TypeSwitch(
    259         c => {
    260           double prev = c[prevIndex];
    261           double next = c[nextIndex];
    262           double interpolationStep = (next - prev) / valuesToInterpolate;
    263           for (int i = prevIndex; i < nextIndex; i++) c[i] = prev + (interpolationStep * (i - prevIndex));
    264         },
    265         dateTimeAction: c => {
    266           var prev = c[prevIndex];
    267           var next = c[nextIndex];
    268           double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
    269           for (int i = prevIndex; i < nextIndex; i++) c[i] = prev.AddSeconds(interpolationStep * (i - prevIndex));
    270         }
    271       );
     259      if (PreprocessingData.VariableHasType<double>(column.Key)) {
     260        double prev = PreprocessingData.GetCell<double>(column.Key, prevIndex);
     261        double next = PreprocessingData.GetCell<double>(column.Key, nextIndex);
     262        double interpolationStep = (next - prev) / valuesToInterpolate;
     263
     264        for (int i = prevIndex; i < nextIndex; ++i) {
     265          double interpolated = prev + (interpolationStep * (i - prevIndex));
     266          PreprocessingData.SetCell<double>(column.Key, i, interpolated);
     267        }
     268      } else if (PreprocessingData.VariableHasType<DateTime>(column.Key)) {
     269        DateTime prev = PreprocessingData.GetCell<DateTime>(column.Key, prevIndex);
     270        DateTime next = PreprocessingData.GetCell<DateTime>(column.Key, nextIndex);
     271        double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
     272
     273        for (int i = prevIndex; i < nextIndex; ++i) {
     274          DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
     275          PreprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
     276        }
     277      }
    272278    }
    273279
    274280    private int IndexOfPrevPresentValue(int columnIndex, int start) {
    275       int index = start - 1;
    276       while (index >= 0 && PreprocessingData.IsCellEmpty(columnIndex, index))
    277         index--;
    278       return index;
     281      int offset = start - 1;
     282      while (offset >= 0 && PreprocessingData.IsCellEmpty(columnIndex, offset)) {
     283        offset--;
     284      }
     285
     286      return offset;
    279287    }
    280288
    281289    private int IndexOfNextPresentValue(int columnIndex, int start) {
    282       int index = start + 1;
    283       while (index < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, index))
    284         index++;
    285       return index;
     290      int offset = start + 1;
     291      while (offset < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, offset)) {
     292        offset++;
     293      }
     294
     295      return offset;
    286296    }
    287297
     
    293303        PreprocessingData.InTransaction(() => {
    294304          // process all given ranges - e.g. TrainingPartition, TestPartition
    295           foreach (var range in ranges) {
     305          foreach (IntRange range in ranges) {
    296306            var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray();
    297307            var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray();
     
    314324    public void ReOrderToIndices(int[] indices) {
    315325      PreprocessingData.InTransaction(() => {
    316         foreach (var column in PreprocessingData.DataColumns) {
    317           column.TypeSwitch(
    318             c => {
    319               if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
    320               var originalData = new List<double>(c.Values);
    321               for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
    322             },
    323             c => {
    324               if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
    325               var originalData = new List<string>(c.Values);
    326               for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
    327             },
    328             c => {
    329               if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
    330               var originalData = new List<DateTime>(c.Values);
    331               for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];
    332             });
    333         }
    334       });
     326        for (int i = 0; i < PreprocessingData.Columns; ++i) {
     327          if (PreprocessingData.VariableHasType<double>(i))
     328            ReOrderToIndices<double>(i, indices);
     329          else if (PreprocessingData.VariableHasType<string>(i))
     330            ReOrderToIndices<string>(i, indices);
     331          else if (PreprocessingData.VariableHasType<DateTime>(i))
     332            ReOrderToIndices<DateTime>(i, indices);
     333        }
     334      });
     335    }
     336
     337    private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
     338      var originalData = new List<T>(PreprocessingData.GetValues<T>(columnIndex));
     339      if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
     340
     341      for (int i = 0; i < indices.Length; i++) {
     342        T newValue = originalData[indices[i]];
     343        PreprocessingData.SetCell<T>(columnIndex, i, newValue);
     344      }
    335345    }
    336346    #endregion
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ManipulationContent.cs

    r15309 r15431  
    5757
    5858      for (int i = 0; i < PreprocessingData.Rows; ++i) {
    59         int missingCount = 0;
    60         for (var col = 0; col < PreprocessingData.DataColumns.Count; col++) {
    61           if (!PreprocessingData.DataColumns[col].IsValidValue(i))
    62             missingCount++;
     59        int missingCount = PreprocessingData.GetRowMissingValueCount(i);
     60        if (100f / PreprocessingData.Columns * missingCount > percent) {
     61          rows.Add(i);
    6362        }
    64         if (100f / PreprocessingData.Columns * missingCount > percent)
    65           rows.Add(i);
    6663      }
    6764
     
    7269      List<int> columns = new List<int>();
    7370      for (int i = 0; i < PreprocessingData.Columns; ++i) {
    74         int missingCount = PreprocessingData.DataColumns[i].GetNumberOfMissingValues();
     71        int missingCount = PreprocessingData.GetMissingValueCount(i);
    7572        if (100f / PreprocessingData.Rows * missingCount > percent) {
    7673          columns.Add(i);
     
    8380    public List<int> ColumnsWithVarianceSmaller(double variance) {
    8481      List<int> columns = new List<int>();
    85 
    86       for (int i = 0; i < PreprocessingData.Columns; i++) {
    87         if (PreprocessingData.DataColumns[i].TypeSwitch<bool>(
    88           c => c.GetVariance() < variance,
    89           c => false,
    90           c => c.GetVariance().Ticks / TimeSpan.TicksPerSecond < variance
    91         ))
    92           columns.Add(i);
     82      for (int i = 0; i < PreprocessingData.Columns; ++i) {
     83        if (PreprocessingData.VariableHasType<double>(i)) {
     84          double columnVariance = PreprocessingData.GetVariance<double>(i);
     85          if (columnVariance < variance) {
     86            columns.Add(i);
     87          }
     88        } else if (PreprocessingData.VariableHasType<DateTime>(i)) {
     89          double columnVariance = (double)PreprocessingData.GetVariance<DateTime>(i).Ticks / TimeSpan.TicksPerSecond;
     90          if (columnVariance < variance) {
     91            columns.Add(i);
     92          }
     93        }
    9394      }
    94 
    9595      return columns;
    9696    }
     
    119119      PreprocessingData.InTransaction(() => {
    120120        foreach (int column in columns.OrderByDescending(x => x)) {
    121           PreprocessingData.DataColumns.RemoveAt(column);
     121          PreprocessingData.DeleteColumn(column);
    122122        }
    123123      });
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/PreprocessingChartContent.cs

    r15309 r15431  
    8282
    8383    public static DataRow CreateDataRow(IFilteredPreprocessingData preprocessingData, string variableName, DataRowVisualProperties.DataRowChartType chartType) {
    84       var values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));
     84      IList<double> values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));
    8585      DataRow row = new DataRow(variableName, "", values);
    8686      row.VisualProperties.ChartType = chartType;
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ScatterPlotContent.cs

    r15309 r15431  
    2121
    2222using System;
     23using System.Collections.Generic;
    2324using System.Linq;
    2425using HeuristicLab.Analysis;
     
    5051    #endregion
    5152
    52     public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY,
    53       string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) {
     53    public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) {
    5454      ScatterPlot scatterPlot = new ScatterPlot();
    5555
    56       var xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));
    57       var yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));
     56      IList<double> xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));
     57      IList<double> yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));
    5858
    5959      var points = xValues.Zip(yValues, (x, y) => new Point2D<double>(x, y)).ToList();
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs

    r15309 r15431  
    3838    private IPreprocessingData filteredData;
    3939
    40     public IList<PreprocessingDataColumn> DataColumns {
    41       get { return ActiveData.DataColumns; }
    42     }
    43 
    4440    public IPreprocessingData ActiveData {
    4541      get { return IsFiltered ? filteredData : originalData; }
     
    8682    }
    8783
    88     public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {
     84    public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
    8985      return ActiveData.GetValues<T>(columnIndex, considerSelection);
    9086    }
    9187
    92     public void SetValues<T>(int columnIndex, IEnumerable<T> values) {
     88    public void SetValues<T>(int columnIndex, IList<T> values) {
    9389      if (IsFiltered)
    9490        throw new InvalidOperationException("SetValues not possible while data is filtered");
     
    127123    }
    128124
    129     public void DeleteRows(IEnumerable<int> rows) {
     125    public void DeleteRowsWithIndices(IEnumerable<int> rows) {
    130126      if (IsFiltered)
    131127        throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered");
    132128
    133       originalData.DeleteRows(rows);
     129      originalData.DeleteRowsWithIndices(rows);
    134130    }
    135131
     
    277273    public void EndTransaction() {
    278274      originalData.EndTransaction();
     275    }
     276    #endregion
     277
     278    #region Statistics
     279    public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     280      return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue);
     281    }
     282    public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     283      return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue);
     284    }
     285    public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     286      return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue);
     287    }
     288    public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     289      return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue);
     290    }
     291    public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
     292      return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue);
     293    }
     294    public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     295      return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue);
     296    }
     297    public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     298      return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue);
     299    }
     300    public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     301      return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue);
     302    }
     303    public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
     304      return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection);
     305    }
     306
     307    public int GetMissingValueCount() {
     308      return ActiveData.GetMissingValueCount();
     309    }
     310    public int GetMissingValueCount(int columnIndex) {
     311      return ActiveData.GetMissingValueCount(columnIndex);
     312    }
     313    public int GetRowMissingValueCount(int rowIndex) {
     314      return ActiveData.GetRowMissingValueCount(rowIndex);
    279315    }
    280316    #endregion
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs

    r15309 r15431  
    2828namespace HeuristicLab.DataPreprocessing {
    2929  public interface IPreprocessingData : INamedItem {
    30 
    31     IList<PreprocessingDataColumn> DataColumns { get; }
    32 
    3330    #region Cells
    3431    bool IsCellEmpty(int columnIndex, int rowIndex);
     
    3936    string GetCellAsString(int columnIndex, int rowIndex);
    4037
    41     IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection = false);
     38    IList<T> GetValues<T>(int columnIndex, bool considerSelection = false);
    4239
    43     void SetValues<T>(int columnIndex, IEnumerable<T> values);
     40    void SetValues<T>(int columnIndex, IList<T> values);
    4441    bool SetValue(string value, int columnIndex, int rowIndex);
    4542
     
    5148    void InsertRow(int rowIndex);
    5249    void DeleteRow(int rowIndex);
    53     void DeleteRows(IEnumerable<int> rows);
     50    void DeleteRowsWithIndices(IEnumerable<int> rows);
    5451    void InsertColumn<T>(string variableName, int columnIndex);
    5552
     
    109106    void EndTransaction();
    110107    #endregion
     108
     109    #region Statistics
     110    T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
     111    T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
     112    T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
     113    T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;
     114    T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>;
     115    T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
     116    T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));
     117    T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;
     118    int GetDistinctValues<T>(int columnIndex, bool considerSelection = false);
     119
     120    int GetMissingValueCount();
     121    int GetMissingValueCount(int columnIndex);
     122    int GetRowMissingValueCount(int rowIndex);
     123    #endregion
    111124  }
    112125}
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs

    r15309 r15431  
    3232
    3333namespace HeuristicLab.DataPreprocessing {
     34
    3435  [Item("PreprocessingData", "Represents data used for preprocessing.")]
    3536  [StorableClass]
    3637  public class PreprocessingData : NamedItem, IPreprocessingData {
    3738
    38     [Storable] private List<PreprocessingDataColumn> dataColumns;
    39 
    40     public IList<PreprocessingDataColumn> DataColumns {
    41       get { return dataColumns; }
    42     }
    43 
     39    [Storable]
     40    protected IList<IList> variableValues;
     41    [Storable]
     42    protected IList<string> variableNames;
    4443
    4544    #region Constructor, Cloning & Persistence
     
    4847      Name = "Preprocessing Data";
    4948
    50       dataColumns = new List<PreprocessingDataColumn>();
    5149      Transformations = new List<ITransformation>();
    5250      selection = new Dictionary<int, IList<int>>();
     
    5957    protected PreprocessingData(PreprocessingData original, Cloner cloner)
    6058      : base(original, cloner) {
    61       dataColumns = new List<PreprocessingDataColumn>(original.dataColumns.Select(cloner.Clone));
    62       TrainingPartition = cloner.Clone(original.TrainingPartition);
    63       TestPartition = cloner.Clone(original.TestPartition);
     59      variableValues = CopyVariableValues(original.variableValues);
     60      variableNames = new List<string>(original.variableNames);
     61      TrainingPartition = (IntRange)original.TrainingPartition.Clone(cloner);
     62      TestPartition = (IntRange)original.TestPartition.Clone(cloner);
    6463      Transformations = new List<ITransformation>(original.Transformations.Select(cloner.Clone));
    6564
     
    10099    #region Cells
    101100    public bool IsCellEmpty(int columnIndex, int rowIndex) {
    102       return !dataColumns[columnIndex].IsValidValue(rowIndex);
     101      var value = variableValues[columnIndex][rowIndex];
     102      return IsMissingValue(value);
    103103    }
    104104
    105105    public T GetCell<T>(int columnIndex, int rowIndex) {
    106       return dataColumns[columnIndex].TypeSwitch<T>(
    107         c => c[rowIndex],
    108         c => c[rowIndex],
    109         c => c[rowIndex]);
     106      return (T)variableValues[columnIndex][rowIndex];
    110107    }
    111108
     
    118115        InsertColumn<T>(i.ToString(), i);
    119116
    120       dataColumns[columnIndex].TypeSwitch<T>(value,
    121         (c, v) => c[rowIndex] = v,
    122         (c, v) => c[rowIndex] = v,
    123         (c, v) => c[rowIndex] = v);
    124 
     117      variableValues[columnIndex][rowIndex] = value;
    125118      if (!IsInTransaction)
    126119        OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex);
     
    128121
    129122    public string GetCellAsString(int columnIndex, int rowIndex) {
    130       return dataColumns[columnIndex].GetValue(rowIndex);
    131     }
    132 
    133     public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {
    134       return dataColumns[columnIndex].TypeSwitch<T>(
    135         c => c.GetValues(considerSelection ? selection[columnIndex] : null),
    136         c => c.GetValues(considerSelection ? selection[columnIndex] : null),
    137         c => c.GetValues(considerSelection ? selection[columnIndex] : null));
    138     }
    139 
    140     public void SetValues<T>(int columnIndex, IEnumerable<T> values) {
     123      return variableValues[columnIndex][rowIndex].ToString();
     124    }
     125
     126    public IList<T> GetValues<T>(int columnIndex, bool considerSelection) {
     127      if (considerSelection) {
     128        var list = new List<T>();
     129        foreach (var rowIdx in selection[columnIndex]) {
     130          list.Add((T)variableValues[columnIndex][rowIdx]);
     131        }
     132        return list;
     133      } else {
     134        return (IList<T>)variableValues[columnIndex];
     135      }
     136    }
     137
     138    public void SetValues<T>(int columnIndex, IList<T> values) {
    141139      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    142140      if (VariableHasType<T>(columnIndex)) {
    143         var name = dataColumns[columnIndex].Name;
    144         if (dataColumns[columnIndex].IsType<double>()) {
    145           dataColumns[columnIndex] = new DoublePreprocessingDataColumn(name, (IEnumerable<double>)values);
    146         } else if (dataColumns[columnIndex].IsType<string>()) {
    147           dataColumns[columnIndex] = new StringPreprocessingDataColumn(name, (IEnumerable<string>)values);
    148         } else if (dataColumns[columnIndex].IsType<DateTime>()) {
    149           dataColumns[columnIndex] = new DateTimePreprocessingDataColumn(name, (IEnumerable<DateTime>)values);
     141        variableValues[columnIndex] = (IList)values;
     142      } else {
     143        throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name);
     144      }
     145      if (!IsInTransaction)
     146        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
     147    }
     148
     149    public bool SetValue(string value, int columnIndex, int rowIndex) {
     150      bool valid = false;
     151      if (VariableHasType<double>(columnIndex)) {
     152        double val;
     153        if (string.IsNullOrWhiteSpace(value)) {
     154          val = double.NaN;
     155          valid = true;
    150156        } else {
    151           throw new ArgumentException("Unknown column type");
    152         }
     157          valid = double.TryParse(value, out val);
     158        }
     159        if (valid)
     160          SetCell(columnIndex, rowIndex, val);
     161      } else if (VariableHasType<string>(columnIndex)) {
     162        valid = value != null;
     163        if (valid)
     164          SetCell(columnIndex, rowIndex, value);
     165      } else if (VariableHasType<DateTime>(columnIndex)) {
     166        DateTime date;
     167        valid = DateTime.TryParse(value, out date);
     168        if (valid)
     169          SetCell(columnIndex, rowIndex, date);
    153170      } else {
    154         throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + dataColumns[columnIndex].GetType().Name + " but was " + typeof(T).Name);
    155       }
     171        throw new ArgumentException("column " + columnIndex + " contains a non supported type.");
     172      }
     173
    156174      if (!IsInTransaction)
    157175        OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    158     }
    159 
    160     public bool SetValue(string value, int columnIndex, int rowIndex) {
    161       var column = dataColumns[columnIndex];
    162       bool successful = column.SetValue(value, rowIndex);
    163 
    164       if (!IsInTransaction)
    165         OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    166 
    167       return successful;
     176
     177      return valid;
    168178    }
    169179
    170180    public int Columns {
    171       get { return dataColumns.Count; }
     181      get { return variableNames.Count; }
    172182    }
    173183
    174184    public int Rows {
    175       get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; }
     185      get { return variableValues.Count > 0 ? variableValues[0].Count : 0; }
     186    }
     187
     188    public static bool IsMissingValue(object value) {
     189      if (value is double) return double.IsNaN((double)value);
     190      if (value is string) return string.IsNullOrEmpty((string)value);
     191      if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue);
     192      throw new ArgumentException();
    176193    }
    177194    #endregion
     
    180197    public void InsertRow(int rowIndex) {
    181198      SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
    182 
    183       foreach (var column in dataColumns) {
    184         column.TypeSwitch(
    185           c => c.Values.Insert(rowIndex, double.NaN),
    186           c => c.Values.Insert(rowIndex, null),
    187           c => c.Values.Insert(rowIndex, DateTime.MinValue));
    188       }
    189 
     199      foreach (IList column in variableValues) {
     200        Type type = column.GetType().GetGenericArguments()[0];
     201        column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
     202      }
    190203      if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    191204        TrainingPartition.End++;
     
    201214        }
    202215      }
    203 
    204216      if (!IsInTransaction)
    205217        OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
    206218    }
    207 
    208219    public void DeleteRow(int rowIndex) {
    209       DeleteRows(new[] { rowIndex });
    210     }
    211     public void DeleteRows(IEnumerable<int> rowIndices) {
     220      SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex);
     221      foreach (IList column in variableValues) {
     222        column.RemoveAt(rowIndex);
     223      }
     224      if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
     225        TrainingPartition.End--;
     226        if (TrainingPartition.End <= TestPartition.Start) {
     227          TestPartition.Start--;
     228          TestPartition.End--;
     229        }
     230      } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) {
     231        TestPartition.End--;
     232        if (TestPartition.End <= TrainingPartition.Start) {
     233          TestPartition.Start--;
     234          TestPartition.End--;
     235        }
     236      }
     237      if (!IsInTransaction)
     238        OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex);
     239    }
     240    public void DeleteRowsWithIndices(IEnumerable<int> rows) {
    212241      SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1);
    213 
    214       foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) {
    215         foreach (var column in dataColumns) {
    216           column.TypeSwitch(
    217             c => c.Values.RemoveAt(rowIndex),
    218             c => c.Values.RemoveAt(rowIndex),
    219             c => c.Values.RemoveAt(rowIndex));
    220         }
    221 
     242      foreach (int rowIndex in rows.OrderByDescending(x => x)) {
     243        foreach (IList column in variableValues) {
     244          column.RemoveAt(rowIndex);
     245        }
    222246        if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) {
    223247          TrainingPartition.End--;
     
    234258        }
    235259      }
    236 
    237260      if (!IsInTransaction)
    238261        OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1);
     
    241264    public void InsertColumn<T>(string variableName, int columnIndex) {
    242265      SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
    243 
    244       if (typeof(T) == typeof(double)) {
    245         dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows)));
    246       } else if (typeof(T) == typeof(string)) {
    247         dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows)));
    248       } else if (typeof(T) == typeof(DateTime)) {
    249         dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows)));
    250       } else {
    251         throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
    252       }
    253 
     266      variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));
     267      variableNames.Insert(columnIndex, variableName);
    254268      if (!IsInTransaction)
    255269        OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
     
    258272    public void DeleteColumn(int columnIndex) {
    259273      SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1);
    260 
    261       dataColumns.RemoveAt(columnIndex);
    262 
     274      variableValues.RemoveAt(columnIndex);
     275      variableNames.RemoveAt(columnIndex);
    263276      if (!IsInTransaction)
    264277        OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1);
     
    266279
    267280    public void RenameColumn(int columnIndex, string name) {
    268       if (columnIndex < 0 || columnIndex > dataColumns.Count)
     281      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
     282      if (columnIndex < 0 || columnIndex > variableNames.Count)
    269283        throw new ArgumentOutOfRangeException("columnIndex");
    270 
    271       SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);
    272 
    273       dataColumns[columnIndex].Name = name;
     284      variableNames[columnIndex] = name;
    274285
    275286      if (!IsInTransaction)
     
    279290    public void RenameColumns(IList<string> names) {
    280291      if (names == null) throw new ArgumentNullException("names");
    281       if (names.Count != dataColumns.Count) throw new ArgumentException("number of names must match the number of columns.", "names");
     292      if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names");
    282293
    283294      SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1);
    284 
    285295      for (int i = 0; i < names.Count; i++)
    286         dataColumns[i].Name = names[i];
     296        variableNames[i] = names[i];
    287297
    288298      if (!IsInTransaction)
     
    291301
    292302    public bool AreAllStringColumns(IEnumerable<int> columnIndices) {
    293       return columnIndices.All(VariableHasType<string>);
     303      return columnIndices.All(x => VariableHasType<string>(x));
    294304    }
    295305    #endregion
     
    297307    #region Variables
    298308    public IEnumerable<string> VariableNames {
    299       get { return dataColumns.Select(c => c.Name); }
     309      get { return variableNames; }
    300310    }
    301311
    302312    public IEnumerable<string> GetDoubleVariableNames() {
    303       return dataColumns.OfType<DoublePreprocessingDataColumn>().Select(c => c.Name);
     313      var doubleVariableNames = new List<string>();
     314      for (int i = 0; i < Columns; ++i) {
     315        if (VariableHasType<double>(i)) {
     316          doubleVariableNames.Add(variableNames[i]);
     317        }
     318      }
     319      return doubleVariableNames;
    304320    }
    305321
    306322    public string GetVariableName(int columnIndex) {
    307       return dataColumns[columnIndex].Name;
     323      return variableNames[columnIndex];
    308324    }
    309325
    310326    public int GetColumnIndex(string variableName) {
    311       return dataColumns.FindIndex(c => c.Name == variableName);
     327      return variableNames.IndexOf(variableName);
    312328    }
    313329
    314330    public bool VariableHasType<T>(int columnIndex) {
    315       return dataColumns[columnIndex].IsType<T>();
     331      return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>;
    316332    }
    317333
    318334    public Type GetVariableType(int columnIndex) {
    319       return dataColumns[columnIndex].GetValueType();
     335      var listType = variableValues[columnIndex].GetType();
     336      return listType.GenericTypeArguments.Single();
    320337    }
    321338
     
    375392    #region Import & Export
    376393    public void Import(IDataAnalysisProblemData problemData) {
    377       var dataset = problemData.Dataset;
     394      Dataset dataset = (Dataset)problemData.Dataset;
     395      variableNames = new List<string>(problemData.Dataset.VariableNames);
    378396      InputVariables = new List<string>(problemData.AllowedInputVariables);
    379       TargetVariable = problemData is IRegressionProblemData ? ((IRegressionProblemData)problemData).TargetVariable
    380         : problemData is IClassificationProblemData ? ((IClassificationProblemData)problemData).TargetVariable
    381         : null;
    382 
    383       dataColumns.Clear();
     397      TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable
     398        : (problemData is IClassificationProblemData) ? ((IClassificationProblemData)problemData).TargetVariable
     399          : null;
     400
     401      int columnIndex = 0;
     402      variableValues = new List<IList>();
    384403      foreach (var variableName in problemData.Dataset.VariableNames) {
    385404        if (dataset.VariableHasType<double>(variableName)) {
    386           dataColumns.Add(new DoublePreprocessingDataColumn(variableName, dataset.GetDoubleValues(variableName)));
     405          variableValues.Insert(columnIndex, dataset.GetDoubleValues(variableName).ToList());
    387406        } else if (dataset.VariableHasType<string>(variableName)) {
    388           dataColumns.Add(new StringPreprocessingDataColumn(variableName, dataset.GetStringValues(variableName)));
     407          variableValues.Insert(columnIndex, dataset.GetStringValues(variableName).ToList());
    389408        } else if (dataset.VariableHasType<DateTime>(variableName)) {
    390           dataColumns.Add(new DateTimePreprocessingDataColumn(variableName, dataset.GetDateTimeValues(variableName)));
     409          variableValues.Insert(columnIndex, dataset.GetDateTimeValues(variableName).ToList());
    391410        } else {
    392411          throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime");
    393412        }
     413        ++columnIndex;
    394414      }
    395415
     
    401421      IList<IList> values = new List<IList>();
    402422
    403       for (int i = 0; i < Columns; i++) {
    404         var doubleColumn = dataColumns[i] as DoublePreprocessingDataColumn;
    405         var stringColumn = dataColumns[i] as StringPreprocessingDataColumn;
    406         var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn;
    407         if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues()));
    408         else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues()));
    409         else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues()));
    410         else throw new InvalidOperationException("Column type not supported for export");
    411       }
    412 
    413       return new Dataset(VariableNames, values);
     423      for (int i = 0; i < Columns; ++i) {
     424        values.Add(variableValues[i]);
     425      }
     426
     427      var dataset = new Dataset(variableNames, values);
     428      return dataset;
    414429    }
    415430    #endregion
     
    437452
    438453    #region Transactions
    439     // Snapshot/History are not storable/cloneable on purpose
     454    // Stapshot/History are nost storable/cloneable on purpose
    440455    private class Snapshot {
    441       public List<PreprocessingDataColumn> DataColumns { get; set; }
     456      public IList<IList> VariableValues { get; set; }
     457      public IList<string> VariableNames { get; set; }
    442458
    443459      public IntRange TrainingPartition { get; set; }
     
    456472    }
    457473
    458     private const int MaxUndoDepth = 5;
     474    private const int MAX_UNDO_DEPTH = 5;
    459475
    460476    private readonly IList<Snapshot> undoHistory = new List<Snapshot>();
     
    466482      if (IsInTransaction) return;
    467483
    468       var cloner = new Cloner();
    469484      var currentSnapshot = new Snapshot {
    470         DataColumns = new List<PreprocessingDataColumn>(dataColumns.Select(cloner.Clone)),
     485        VariableValues = CopyVariableValues(variableValues),
     486        VariableNames = new List<string>(variableNames),
    471487        TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End),
    472488        TestPartition = new IntRange(TestPartition.Start, TestPartition.End),
     
    477493      };
    478494
    479       if (undoHistory.Count >= MaxUndoDepth)
     495      if (undoHistory.Count >= MAX_UNDO_DEPTH)
    480496        undoHistory.RemoveAt(0);
    481497
     
    490506      if (IsUndoAvailable) {
    491507        Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1];
    492         dataColumns = previousSnapshot.DataColumns;
     508        variableValues = previousSnapshot.VariableValues;
     509        variableNames = previousSnapshot.VariableNames;
    493510        TrainingPartition = previousSnapshot.TrainingPartition;
    494511        TestPartition = previousSnapshot.TestPartition;
     
    521538    #endregion
    522539
    523     /* #region Statistics
    524      public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    525        try {
    526          return dataColumns[columnIndex].TypeSwitch<T>(
    527            col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
    528            col => col.GetMin(considerSelection ? Selection[columnIndex] : null),
    529            col => col.GetMin(considerSelection ? Selection[columnIndex] : null));
    530        } catch (InvalidOperationException) {
    531          return emptyValue;
    532        }
    533      }
    534 
    535      public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    536        var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    537        return values.Any() ? values.Max() : emptyValue;
    538      }
    539 
    540      public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    541        return
    542 
    543 
    544        if (typeof(T) == typeof(double)) {
    545          var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    546          return values.Any() ? Convert<T>(values.Average()) : emptyValue;
    547        }
    548        if (typeof(T) == typeof(string)) {
    549          return Convert<T>(string.Empty);
    550        }
    551        if (typeof(T) == typeof(DateTime)) {
    552          var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    553          return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
    554        }
    555 
    556        throw new InvalidOperationException(typeof(T) + " not supported");
    557      }
    558 
    559      public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    560        if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
    561          var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    562          return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
    563        }
    564        var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    565        return values.Any() ? values.Quantile(0.5) : emptyValue;
    566      }
    567 
    568      public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
    569        var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    570        return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
    571      }
    572 
    573      public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    574        if (typeof(T) == typeof(double)) {
    575          var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    576          return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
    577        }
    578        // For DateTime, std.dev / variance would have to be TimeSpan
    579        //if (typeof(T) == typeof(DateTime)) {
    580        //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    581        //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
    582        //}
    583        return default(T);
    584      }
    585 
    586      public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
    587        if (typeof(T) == typeof(double)) {
    588          var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    589          return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
    590        }
    591        // DateTime variance often overflows long, thus the corresponding DateTime is invalid
    592        //if (typeof(T) == typeof(DateTime)) {
    593        //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
    594        //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
    595        //}
    596        return default(T);
    597      }
    598 
    599      public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
    600        if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
    601          var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
    602          return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
    603        }
    604        var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    605        return values.Any() ? values.Quantile(alpha) : emptyValue;
    606      }
    607 
    608      public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
    609        var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
    610        return values.GroupBy(x => x).Count();
    611      }
    612 
    613      private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
    614        return GetValues<T>(columnIndex, considerSelection).Where(x =>
    615          ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x,
    616            (c, v) => c.IsValidValue(v),
    617            (c, v) => c.IsValidValue(v),
    618            (c, v) => c.IsValidValue(v)
    619        ));
    620      }
    621 
    622      private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
    623        return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
    624      }
    625 
    626      public int GetMissingValueCount() {
    627        int count = 0;
    628        for (int i = 0; i < Columns; ++i) {
    629          count += GetMissingValueCount(i);
    630        }
    631        return count;
    632      }
    633      public int GetMissingValueCount(int columnIndex) {
    634        int sum = 0;
    635        for (int i = 0; i < Rows; i++) {
    636          if (IsCellEmpty(columnIndex, i))
    637            sum++;
    638        }
    639        return sum;
    640      }
    641      public int GetRowMissingValueCount(int rowIndex) {
    642        int sum = 0;
    643        for (int i = 0; i < Columns; i++) {
    644          if (IsCellEmpty(i, rowIndex))
    645            sum++;
    646        }
    647        return sum;
    648      }
    649      #endregion  */
     540    #region Statistics
     541    public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     542      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     543      return values.Any() ? values.Min() : emptyValue;
     544    }
     545
     546    public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     547      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     548      return values.Any() ? values.Max() : emptyValue;
     549    }
     550
     551    public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     552      if (typeof(T) == typeof(double)) {
     553        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     554        return values.Any() ? Convert<T>(values.Average()) : emptyValue;
     555      }
     556      if (typeof(T) == typeof(string)) {
     557        return Convert<T>(string.Empty);
     558      }
     559      if (typeof(T) == typeof(DateTime)) {
     560        var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     561        return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue;
     562      }
     563
     564      throw new InvalidOperationException(typeof(T) + " not supported");
     565    }
     566
     567    public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     568      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
     569        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     570        return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue;
     571      }
     572      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     573      return values.Any() ? values.Quantile(0.5) : emptyValue;
     574    }
     575
     576    public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> {
     577      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     578      return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue;
     579    }
     580
     581    public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     582      if (typeof(T) == typeof(double)) {
     583        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     584        return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue;
     585      }
     586      // For DateTime, std.dev / variance would have to be TimeSpan
     587      //if (typeof(T) == typeof(DateTime)) {
     588      //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     589      //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue;
     590      //}
     591      return default(T);
     592    }
     593
     594    public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {
     595      if (typeof(T) == typeof(double)) {
     596        var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     597        return values.Any() ? Convert<T>(values.Variance()) : emptyValue;
     598      }
     599      // DateTime variance often overflows long, thus the corresponding DateTime is invalid
     600      //if (typeof(T) == typeof(DateTime)) {
     601      //  var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection);
     602      //  return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue;
     603      //}
     604      return default(T);
     605    }
     606
     607    public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {
     608      if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 
     609        var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection);
     610        return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue;
     611      }
     612      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     613      return values.Any() ? values.Quantile(alpha) : emptyValue;
     614    }
     615
     616    public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) {
     617      var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection);
     618      return values.GroupBy(x => x).Count();
     619    }
     620
     621    private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) {
     622      return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x));
     623    }
     624
     625    private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) {
     626      return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond));
     627    }
     628    private static T Convert<T>(object obj) { return (T)obj; }
     629
     630    public int GetMissingValueCount() {
     631      int count = 0;
     632      for (int i = 0; i < Columns; ++i) {
     633        count += GetMissingValueCount(i);
     634      }
     635      return count;
     636    }
     637    public int GetMissingValueCount(int columnIndex) {
     638      int sum = 0;
     639      for (int i = 0; i < Rows; i++) {
     640        if (IsCellEmpty(columnIndex, i))
     641          sum++;
     642      }
     643      return sum;
     644    }
     645    public int GetRowMissingValueCount(int rowIndex) {
     646      int sum = 0;
     647      for (int i = 0; i < Columns; i++) {
     648        if (IsCellEmpty(i, rowIndex))
     649          sum++;
     650      }
     651      return sum;
     652    }
     653    #endregion
     654
     655    #region Helpers
     656    private static IList<IList> CopyVariableValues(IList<IList> original) {
     657      var copy = new List<IList>(original);
     658      for (int i = 0; i < original.Count; ++i) {
     659        copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]);
     660      }
     661      return copy;
     662    }
     663    #endregion
    650664  }
    651665
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/HeuristicLab.DataPreprocessing-3.4.csproj

    r15291 r15431  
    123123    <Compile Include="Content\ScatterPlotContent.cs" />
    124124    <Compile Include="Content\DataCompletenessChartContent.cs" />
    125     <Compile Include="Data\Columns\DateTimePreprocessingDataColumn.cs" />
    126     <Compile Include="Data\Columns\DoublePreprocessingDataColumn.cs" />
    127     <Compile Include="Data\Columns\PreprocessingDataColumn.cs" />
    128     <Compile Include="Data\Columns\StringPreprocessingDataColumn.cs" />
    129125    <Compile Include="Data\FilteredPreprocessingData.cs" />
    130126    <Compile Include="Content\ManipulationContent.cs" />
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/PreprocessingTransformator.cs

    r15309 r15431  
    8383          int colIndex = preprocessingData.GetColumnIndex(transformation.Column);
    8484          var originalData = preprocessingData.GetValues<double>(colIndex);
    85           originalColumns.Add(transformation.Column, originalData.ToList());
     85          originalColumns.Add(transformation.Column, originalData);
    8686        }
    8787      }
     
    107107    }
    108108
    109     private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IEnumerable<double> data, out bool success, out string errorMsg) {
     109    private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IList<double> data, out bool success, out string errorMsg) {
    110110      success = transformation.Check(data, out errorMsg);
    111111      // don't apply when the check fails
  • branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/ProblemDataCreator.cs

    r15309 r15431  
    129129    }
    130130
    131     private bool IsNotConstantInputVariable(IEnumerable<double> list) {
     131    private bool IsNotConstantInputVariable(IList<double> list) {
    132132      return context.Data.TrainingPartition.End - context.Data.TrainingPartition.Start > 1 || list.Range() > 0;
    133133    }
Note: See TracChangeset for help on using the changeset viewer.