Changeset 15431 for branches/DataPreprocessing Cleanup
- Timestamp:
- 10/25/17 12:38:12 (7 years ago)
- Location:
- branches/DataPreprocessing Cleanup
- Files:
-
- 1 deleted
- 12 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing Cleanup/DataPreprocessing Cleanup.sln
r15291 r15431 2 2 Microsoft Visual Studio Solution File, Format Version 12.00 3 3 # Visual Studio 15 4 VisualStudioVersion = 15.0.26430.1 64 VisualStudioVersion = 15.0.26430.15 5 5 MinimumVisualStudioVersion = 10.0.40219.1 6 6 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.DataPreprocessing-3.4", "HeuristicLab.DataPreprocessing\3.4\HeuristicLab.DataPreprocessing-3.4.csproj", "{3B90F866-70F8-43EF-A541-51819D255B7B}" -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing.Views/3.4/StatisticsView.cs
r15309 r15431 92 92 numericColumnsTextBox.Text = GetColumnCount<double>().ToString(); 93 93 nominalColumnsTextBox5.Text = GetColumnCount<string>().ToString(); 94 missingValuesTextBox.Text = data. DataColumns.Sum(c => c.GetNumberOfMissingValues()).ToString();95 totalValuesTextBox.Text = (data.Rows * data.Rows - data. DataColumns.Sum(c => c.GetNumberOfMissingValues())).ToString();94 missingValuesTextBox.Text = data.GetMissingValueCount().ToString(); 95 totalValuesTextBox.Text = (data.Rows * data.Rows - data.GetMissingValueCount()).ToString(); 96 96 97 97 var variableNames = Content.PreprocessingData.VariableNames.ToList(); … … 111 111 for (int j = 0; j < statistics.Count; j++) { 112 112 if (horizontal) 113 statisticsMatrix[j, i] = statistics[j] .ToString();113 statisticsMatrix[j, i] = statistics[j]; 114 114 else 115 statisticsMatrix[i, j] = statistics[j] .ToString();115 statisticsMatrix[i, j] = statistics[j]; 116 116 } 117 117 } … … 148 148 } 149 149 150 private IListGetStatistics(int varIdx) {151 IListlist;150 private List<string> GetStatistics(int varIdx) { 151 List<string> list; 152 152 var data = Content.PreprocessingData; 153 153 if (data.VariableHasType<double>(varIdx)) { … … 166 166 } 167 167 168 private IListGetDoubleColumns(int statIdx) {169 var column = (DoublePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];170 return new List< object> {171 column.GetValueType().Name,172 column.GetNumberOfMissingValues(),173 column.GetMin(),174 column.GetMax(),175 column.GetMedian(),176 column.GetMean(),177 column.GetStandardDeviation(),178 column.GetVariance(),179 column.GetQuantile(0.25),180 column.GetQuantile(0.75),181 column.GetMode(),182 column.GetDistinctValues()168 private List<string> GetDoubleColumns(int statIdx) { 169 var data = Content.PreprocessingData; 170 return new List<string> { 171 data.GetVariableType(statIdx).Name, 172 data.GetMissingValueCount(statIdx).ToString(), 173 data.GetMin<double>(statIdx, emptyValue: double.NaN).ToString(), 174 data.GetMax<double>(statIdx, emptyValue: double.NaN).ToString(), 175 data.GetMedian<double>(statIdx, emptyValue: double.NaN).ToString(), 176 data.GetMean<double>(statIdx, emptyValue: double.NaN).ToString(), 177 data.GetStandardDeviation<double>(statIdx, emptyValue: double.NaN).ToString(), 178 data.GetVariance<double>(statIdx, emptyValue: double.NaN).ToString(), 179 data.GetQuantile<double>(0.25, statIdx, emptyValue: double.NaN).ToString(), 180 data.GetQuantile<double>(0.75, statIdx, emptyValue: double.NaN).ToString(), 181 data.GetMode<double>(statIdx, emptyValue: double.NaN).ToString(), 182 data.GetDistinctValues<double>(statIdx).ToString() 183 183 }; 184 184 } 185 185 186 private IListGetStringColumns(int statIdx) {187 var column = (StringPreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];188 return new List< object> {189 column.GetValueType().Name,190 column.GetNumberOfMissingValues(),191 "", // min192 "", // max193 "", // median186 private List<string> GetStringColumns(int statIdx) { 187 var data = Content.PreprocessingData; 188 return new List<string> { 189 data.GetVariableType(statIdx).Name, 190 data.GetMissingValueCount(statIdx).ToString(), 191 "", // data.GetMin<string>(statIdx, emptyValue: string.Empty), //min 192 "", // data.GetMax<string>(statIdx, emptyValue: string.Empty), //max 193 "", // data.GetMedian<string>(statIdx, emptyValue: string.Empty), //median 194 194 "", //average 195 195 "", //standard deviation 196 196 "", //variance 197 "", // quarter percentile198 "", // three quarter percentile199 column.GetMode(),200 column.GetDistinctValues()197 "", // data.GetQuantile<string>(0.25, statIdx, emptyValue: string.Empty), //quarter percentile 198 "", // data.GetQuantile<string>(0.75, statIdx, emptyValue: string.Empty), //three quarter percentile 199 data.GetMode<string>(statIdx, emptyValue: string.Empty), 200 data.GetDistinctValues<string>(statIdx).ToString() 201 201 }; 202 202 } 203 203 204 private IListGetDateTimeColumns(int statIdx) {205 var column = (DateTimePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx];206 return new List< object> {207 column.GetValueType().Name,208 column.GetNumberOfMissingValues(),209 column.GetMin(),210 column.GetMax(),211 column.GetMedian(),212 column.GetMean(),213 column.GetStandardDeviation(),214 /*column.GetVariance()*/"", // variance (in ticks) is usually to high to display a valid TimeSpan or DateTime215 column.GetQuantile(0.25),216 column.GetQuantile(0.75),217 column.GetMode(),218 column.GetDistinctValues()204 private List<string> GetDateTimeColumns(int statIdx) { 205 var data = Content.PreprocessingData; 206 return new List<string> { 207 data.GetVariableType(statIdx).Name, 208 data.GetMissingValueCount(statIdx).ToString(), 209 data.GetMin<DateTime>(statIdx).ToString(), 210 data.GetMax<DateTime>(statIdx).ToString(), 211 data.GetMedian<DateTime>(statIdx).ToString(), 212 data.GetMean<DateTime>(statIdx).ToString(), 213 "", // should be of type TimeSpan //data.GetStandardDeviation<DateTime>(statIdx).ToString(), 214 "", // should be of type TimeSpan //data.GetVariance<DateTime>(statIdx).ToString(), 215 data.GetQuantile<DateTime>(0.25, statIdx).ToString(), 216 data.GetQuantile<DateTime>(0.75, statIdx).ToString(), 217 data.GetMode<DateTime>(statIdx).ToString(), 218 data.GetDistinctValues<DateTime>(statIdx).ToString() 219 219 }; 220 220 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/DataGridContent.cs
r15309 r15431 90 90 91 91 public void DeleteRows(IEnumerable<int> rows) { 92 PreprocessingData.DeleteRows (rows);92 PreprocessingData.DeleteRowsWithIndices(rows); 93 93 } 94 94 … … 134 134 135 135 #region Manipulations 136 private void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, Func<int, double> doubleAggregator = null, 137 Func<int, DateTime> dateTimeAggregator = null, Func<int, string> stringAggregator = null) { 138 PreprocessingData.InTransaction(() => { 139 foreach (var column in cells) { 140 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) { 141 var value = doubleAggregator(column.Key); 142 foreach (int index in column.Value) 143 PreprocessingData.SetCell<double>(column.Key, index, value); 144 } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) { 145 var value = dateTimeAggregator(column.Key); 146 foreach (int index in column.Value) 147 PreprocessingData.SetCell<DateTime>(column.Key, index, value); 148 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) { 149 var value = stringAggregator(column.Key); 150 foreach (int index in column.Value) 151 PreprocessingData.SetCell<string>(column.Key, index, value); 152 } 153 } 154 }); 155 } 156 157 private void ReplaceIndicesByValues(IDictionary<int, IList<int>> cells, Func<int, IEnumerable<double>> doubleAggregator = null, 158 Func<int, IEnumerable<DateTime>> dateTimeAggregator = null, Func<int, IEnumerable<string>> stringAggregator = null) { 159 PreprocessingData.InTransaction(() => { 160 foreach (var column in cells) { 161 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) { 162 var values = doubleAggregator(column.Key); 163 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 164 PreprocessingData.SetCell<double>(column.Key, pair.row, pair.value); 165 } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) { 166 var values = dateTimeAggregator(column.Key); 167 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 168 PreprocessingData.SetCell<DateTime>(column.Key, pair.row, pair.value); 169 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) { 170 var values = stringAggregator(column.Key); 171 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value })) 172 PreprocessingData.SetCell<string>(column.Key, pair.row, pair.value); 173 } 174 } 175 }); 176 } 177 136 178 public void ReplaceIndicesByMean(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 137 PreprocessingData.InTransaction(() => { 138 foreach (var column in cells) { 139 PreprocessingData.DataColumns[column.Key].TypeSwitch( 140 c => { 141 var mean = c.GetMean(considerSelection ? column.Value : null); 142 foreach (var index in column.Value) c[index] = mean; 143 }, 144 dateTimeAction: c => { 145 var mean = c.GetMean(considerSelection ? column.Value : null); 146 foreach (var index in column.Value) c[index] = mean; 147 }); 148 } 149 }); 179 ReplaceIndicesByValue(cells, 180 col => PreprocessingData.GetMean<double>(col, considerSelection), 181 col => PreprocessingData.GetMean<DateTime>(col, considerSelection)); 150 182 } 151 183 152 184 public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 153 PreprocessingData.InTransaction(() => { 154 foreach (var column in cells) { 155 PreprocessingData.DataColumns[column.Key].TypeSwitch( 156 c => { 157 var median = c.GetMedian(considerSelection ? column.Value : null); 158 foreach (var index in column.Value) c[index] = median; 159 }, 160 c => { 161 var median = c.GetMedian(considerSelection ? column.Value : null); 162 foreach (var index in column.Value) c[index] = median; 163 }, 164 c => { 165 var median = c.GetMedian(considerSelection ? column.Value : null); 166 foreach (var index in column.Value) c[index] = median; 167 }); 168 } 169 }); 185 ReplaceIndicesByValue(cells, 186 col => PreprocessingData.GetMedian<double>(col, considerSelection), 187 col => PreprocessingData.GetMedian<DateTime>(col, considerSelection)); 170 188 } 171 189 172 190 public void ReplaceIndicesByMode(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 173 PreprocessingData.InTransaction(() => { 174 foreach (var column in cells) { 175 PreprocessingData.DataColumns[column.Key].TypeSwitch( 176 c => { 177 var mode = c.GetMode(considerSelection ? column.Value : null); 178 foreach (var index in column.Value) c[index] = mode; 179 }, 180 c => { 181 var mode = c.GetMode(considerSelection ? column.Value : null); 182 foreach (var index in column.Value) c[index] = mode; 183 }, 184 c => { 185 var mode = c.GetMode(considerSelection ? column.Value : null); 186 foreach (var index in column.Value) c[index] = mode; 187 }); 188 } 189 }); 191 ReplaceIndicesByValue(cells, 192 col => PreprocessingData.GetMode<double>(col, considerSelection), 193 col => PreprocessingData.GetMode<DateTime>(col, considerSelection), 194 col => PreprocessingData.GetMode<string>(col, considerSelection)); 190 195 } 191 196 192 197 public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 193 198 var rand = new FastRandom(); 194 PreprocessingData.InTransaction(() => { 195 foreach (var column in cells) { 196 PreprocessingData.DataColumns[column.Key].TypeSwitch( 197 c => { 198 double min = c.GetMin(considerSelection ? column.Value : null); 199 double max = c.GetMax(considerSelection ? column.Value : null); 200 double range = max - min; 201 foreach (var index in column.Value) c[index] = min + rand.NextDouble() * range; 202 }, 203 dateTimeAction: c => { 204 var min = c.GetMin(considerSelection ? column.Value : null); 205 var max = c.GetMax(considerSelection ? column.Value : null); 206 double range = (max - min).TotalSeconds; 207 foreach (var index in column.Value) c[index] = min + TimeSpan.FromSeconds(rand.NextDouble() * range); 208 }); 209 } 210 }); 199 ReplaceIndicesByValues(cells, 200 col => { 201 double min = PreprocessingData.GetMin<double>(col, considerSelection); 202 double max = PreprocessingData.GetMax<double>(col, considerSelection); 203 double range = max - min; 204 return cells[col].Select(_ => rand.NextDouble() * range + min); 205 }, 206 col => { 207 var min = PreprocessingData.GetMin<DateTime>(col, considerSelection); 208 var max = PreprocessingData.GetMax<DateTime>(col, considerSelection); 209 double range = (max - min).TotalSeconds; 210 return cells[col].Select(_ => min + TimeSpan.FromSeconds(rand.NextDouble() * range)); 211 }); 211 212 } 212 213 … … 215 216 foreach (var column in cells) { 216 217 foreach (var rowIdx in column.Value) { 217 PreprocessingData. DataColumns[column.Key].SetValue(value, rowIdx);218 PreprocessingData.SetValue(value, column.Key, rowIdx); 218 219 } 219 220 } … … 256 257 int valuesToInterpolate = nextIndex - prevIndex; 257 258 258 PreprocessingData.DataColumns[column.Key].TypeSwitch( 259 c => { 260 double prev = c[prevIndex]; 261 double next = c[nextIndex]; 262 double interpolationStep = (next - prev) / valuesToInterpolate; 263 for (int i = prevIndex; i < nextIndex; i++) c[i] = prev + (interpolationStep * (i - prevIndex)); 264 }, 265 dateTimeAction: c => { 266 var prev = c[prevIndex]; 267 var next = c[nextIndex]; 268 double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate; 269 for (int i = prevIndex; i < nextIndex; i++) c[i] = prev.AddSeconds(interpolationStep * (i - prevIndex)); 270 } 271 ); 259 if (PreprocessingData.VariableHasType<double>(column.Key)) { 260 double prev = PreprocessingData.GetCell<double>(column.Key, prevIndex); 261 double next = PreprocessingData.GetCell<double>(column.Key, nextIndex); 262 double interpolationStep = (next - prev) / valuesToInterpolate; 263 264 for (int i = prevIndex; i < nextIndex; ++i) { 265 double interpolated = prev + (interpolationStep * (i - prevIndex)); 266 PreprocessingData.SetCell<double>(column.Key, i, interpolated); 267 } 268 } else if (PreprocessingData.VariableHasType<DateTime>(column.Key)) { 269 DateTime prev = PreprocessingData.GetCell<DateTime>(column.Key, prevIndex); 270 DateTime next = PreprocessingData.GetCell<DateTime>(column.Key, nextIndex); 271 double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate; 272 273 for (int i = prevIndex; i < nextIndex; ++i) { 274 DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex)); 275 PreprocessingData.SetCell<DateTime>(column.Key, i, interpolated); 276 } 277 } 272 278 } 273 279 274 280 private int IndexOfPrevPresentValue(int columnIndex, int start) { 275 int index = start - 1; 276 while (index >= 0 && PreprocessingData.IsCellEmpty(columnIndex, index)) 277 index--; 278 return index; 281 int offset = start - 1; 282 while (offset >= 0 && PreprocessingData.IsCellEmpty(columnIndex, offset)) { 283 offset--; 284 } 285 286 return offset; 279 287 } 280 288 281 289 private int IndexOfNextPresentValue(int columnIndex, int start) { 282 int index = start + 1; 283 while (index < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, index)) 284 index++; 285 return index; 290 int offset = start + 1; 291 while (offset < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, offset)) { 292 offset++; 293 } 294 295 return offset; 286 296 } 287 297 … … 293 303 PreprocessingData.InTransaction(() => { 294 304 // process all given ranges - e.g. TrainingPartition, TestPartition 295 foreach ( varrange in ranges) {305 foreach (IntRange range in ranges) { 296 306 var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray(); 297 307 var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray(); … … 314 324 public void ReOrderToIndices(int[] indices) { 315 325 PreprocessingData.InTransaction(() => { 316 for each (var column in PreprocessingData.DataColumns) {317 column.TypeSwitch(318 c => {319 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");320 var originalData = new List<double>(c.Values);321 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];322 },323 c => {324 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");325 var originalData = new List<string>(c.Values);326 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]]; 327 },328 c => {329 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");330 var originalData = new List<DateTime>(c.Values); 331 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]];332 });333 }334 } );326 for (int i = 0; i < PreprocessingData.Columns; ++i) { 327 if (PreprocessingData.VariableHasType<double>(i)) 328 ReOrderToIndices<double>(i, indices); 329 else if (PreprocessingData.VariableHasType<string>(i)) 330 ReOrderToIndices<string>(i, indices); 331 else if (PreprocessingData.VariableHasType<DateTime>(i)) 332 ReOrderToIndices<DateTime>(i, indices); 333 } 334 }); 335 } 336 337 private void ReOrderToIndices<T>(int columnIndex, int[] indices) { 338 var originalData = new List<T>(PreprocessingData.GetValues<T>(columnIndex)); 339 if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values."); 340 341 for (int i = 0; i < indices.Length; i++) { 342 T newValue = originalData[indices[i]]; 343 PreprocessingData.SetCell<T>(columnIndex, i, newValue); 344 } 335 345 } 336 346 #endregion -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ManipulationContent.cs
r15309 r15431 57 57 58 58 for (int i = 0; i < PreprocessingData.Rows; ++i) { 59 int missingCount = 0; 60 for (var col = 0; col < PreprocessingData.DataColumns.Count; col++) { 61 if (!PreprocessingData.DataColumns[col].IsValidValue(i)) 62 missingCount++; 59 int missingCount = PreprocessingData.GetRowMissingValueCount(i); 60 if (100f / PreprocessingData.Columns * missingCount > percent) { 61 rows.Add(i); 63 62 } 64 if (100f / PreprocessingData.Columns * missingCount > percent)65 rows.Add(i);66 63 } 67 64 … … 72 69 List<int> columns = new List<int>(); 73 70 for (int i = 0; i < PreprocessingData.Columns; ++i) { 74 int missingCount = PreprocessingData. DataColumns[i].GetNumberOfMissingValues();71 int missingCount = PreprocessingData.GetMissingValueCount(i); 75 72 if (100f / PreprocessingData.Rows * missingCount > percent) { 76 73 columns.Add(i); … … 83 80 public List<int> ColumnsWithVarianceSmaller(double variance) { 84 81 List<int> columns = new List<int>(); 85 86 for (int i = 0; i < PreprocessingData.Columns; i++) { 87 if (PreprocessingData.DataColumns[i].TypeSwitch<bool>( 88 c => c.GetVariance() < variance, 89 c => false, 90 c => c.GetVariance().Ticks / TimeSpan.TicksPerSecond < variance 91 )) 92 columns.Add(i); 82 for (int i = 0; i < PreprocessingData.Columns; ++i) { 83 if (PreprocessingData.VariableHasType<double>(i)) { 84 double columnVariance = PreprocessingData.GetVariance<double>(i); 85 if (columnVariance < variance) { 86 columns.Add(i); 87 } 88 } else if (PreprocessingData.VariableHasType<DateTime>(i)) { 89 double columnVariance = (double)PreprocessingData.GetVariance<DateTime>(i).Ticks / TimeSpan.TicksPerSecond; 90 if (columnVariance < variance) { 91 columns.Add(i); 92 } 93 } 93 94 } 94 95 95 return columns; 96 96 } … … 119 119 PreprocessingData.InTransaction(() => { 120 120 foreach (int column in columns.OrderByDescending(x => x)) { 121 PreprocessingData.D ataColumns.RemoveAt(column);121 PreprocessingData.DeleteColumn(column); 122 122 } 123 123 }); -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/PreprocessingChartContent.cs
r15309 r15431 82 82 83 83 public static DataRow CreateDataRow(IFilteredPreprocessingData preprocessingData, string variableName, DataRowVisualProperties.DataRowChartType chartType) { 84 varvalues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));84 IList<double> values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName)); 85 85 DataRow row = new DataRow(variableName, "", values); 86 86 row.VisualProperties.ChartType = chartType; -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ScatterPlotContent.cs
r15309 r15431 21 21 22 22 using System; 23 using System.Collections.Generic; 23 24 using System.Linq; 24 25 using HeuristicLab.Analysis; … … 50 51 #endregion 51 52 52 public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, 53 string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) { 53 public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) { 54 54 ScatterPlot scatterPlot = new ScatterPlot(); 55 55 56 varxValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));57 varyValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));56 IList<double> xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX)); 57 IList<double> yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY)); 58 58 59 59 var points = xValues.Zip(yValues, (x, y) => new Point2D<double>(x, y)).ToList(); -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs
r15309 r15431 38 38 private IPreprocessingData filteredData; 39 39 40 public IList<PreprocessingDataColumn> DataColumns {41 get { return ActiveData.DataColumns; }42 }43 44 40 public IPreprocessingData ActiveData { 45 41 get { return IsFiltered ? filteredData : originalData; } … … 86 82 } 87 83 88 public I Enumerable<T> GetValues<T>(int columnIndex, bool considerSelection) {84 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 89 85 return ActiveData.GetValues<T>(columnIndex, considerSelection); 90 86 } 91 87 92 public void SetValues<T>(int columnIndex, I Enumerable<T> values) {88 public void SetValues<T>(int columnIndex, IList<T> values) { 93 89 if (IsFiltered) 94 90 throw new InvalidOperationException("SetValues not possible while data is filtered"); … … 127 123 } 128 124 129 public void DeleteRows (IEnumerable<int> rows) {125 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 130 126 if (IsFiltered) 131 127 throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered"); 132 128 133 originalData.DeleteRows (rows);129 originalData.DeleteRowsWithIndices(rows); 134 130 } 135 131 … … 277 273 public void EndTransaction() { 278 274 originalData.EndTransaction(); 275 } 276 #endregion 277 278 #region Statistics 279 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 280 return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue); 281 } 282 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 283 return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue); 284 } 285 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 286 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 287 } 288 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 289 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 290 } 291 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 292 return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue); 293 } 294 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 295 return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue); 296 } 297 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 298 return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue); 299 } 300 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 301 return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue); 302 } 303 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 304 return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection); 305 } 306 307 public int GetMissingValueCount() { 308 return ActiveData.GetMissingValueCount(); 309 } 310 public int GetMissingValueCount(int columnIndex) { 311 return ActiveData.GetMissingValueCount(columnIndex); 312 } 313 public int GetRowMissingValueCount(int rowIndex) { 314 return ActiveData.GetRowMissingValueCount(rowIndex); 279 315 } 280 316 #endregion -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs
r15309 r15431 28 28 namespace HeuristicLab.DataPreprocessing { 29 29 public interface IPreprocessingData : INamedItem { 30 31 IList<PreprocessingDataColumn> DataColumns { get; }32 33 30 #region Cells 34 31 bool IsCellEmpty(int columnIndex, int rowIndex); … … 39 36 string GetCellAsString(int columnIndex, int rowIndex); 40 37 41 I Enumerable<T> GetValues<T>(int columnIndex, bool considerSelection = false);38 IList<T> GetValues<T>(int columnIndex, bool considerSelection = false); 42 39 43 void SetValues<T>(int columnIndex, I Enumerable<T> values);40 void SetValues<T>(int columnIndex, IList<T> values); 44 41 bool SetValue(string value, int columnIndex, int rowIndex); 45 42 … … 51 48 void InsertRow(int rowIndex); 52 49 void DeleteRow(int rowIndex); 53 void DeleteRows (IEnumerable<int> rows);50 void DeleteRowsWithIndices(IEnumerable<int> rows); 54 51 void InsertColumn<T>(string variableName, int columnIndex); 55 52 … … 109 106 void EndTransaction(); 110 107 #endregion 108 109 #region Statistics 110 T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 111 T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 112 T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 113 T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 114 T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>; 115 T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 116 T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)); 117 T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>; 118 int GetDistinctValues<T>(int columnIndex, bool considerSelection = false); 119 120 int GetMissingValueCount(); 121 int GetMissingValueCount(int columnIndex); 122 int GetRowMissingValueCount(int rowIndex); 123 #endregion 111 124 } 112 125 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15309 r15431 32 32 33 33 namespace HeuristicLab.DataPreprocessing { 34 34 35 [Item("PreprocessingData", "Represents data used for preprocessing.")] 35 36 [StorableClass] 36 37 public class PreprocessingData : NamedItem, IPreprocessingData { 37 38 38 [Storable] private List<PreprocessingDataColumn> dataColumns; 39 40 public IList<PreprocessingDataColumn> DataColumns { 41 get { return dataColumns; } 42 } 43 39 [Storable] 40 protected IList<IList> variableValues; 41 [Storable] 42 protected IList<string> variableNames; 44 43 45 44 #region Constructor, Cloning & Persistence … … 48 47 Name = "Preprocessing Data"; 49 48 50 dataColumns = new List<PreprocessingDataColumn>();51 49 Transformations = new List<ITransformation>(); 52 50 selection = new Dictionary<int, IList<int>>(); … … 59 57 protected PreprocessingData(PreprocessingData original, Cloner cloner) 60 58 : base(original, cloner) { 61 dataColumns = new List<PreprocessingDataColumn>(original.dataColumns.Select(cloner.Clone)); 62 TrainingPartition = cloner.Clone(original.TrainingPartition); 63 TestPartition = cloner.Clone(original.TestPartition); 59 variableValues = CopyVariableValues(original.variableValues); 60 variableNames = new List<string>(original.variableNames); 61 TrainingPartition = (IntRange)original.TrainingPartition.Clone(cloner); 62 TestPartition = (IntRange)original.TestPartition.Clone(cloner); 64 63 Transformations = new List<ITransformation>(original.Transformations.Select(cloner.Clone)); 65 64 … … 100 99 #region Cells 101 100 public bool IsCellEmpty(int columnIndex, int rowIndex) { 102 return !dataColumns[columnIndex].IsValidValue(rowIndex); 101 var value = variableValues[columnIndex][rowIndex]; 102 return IsMissingValue(value); 103 103 } 104 104 105 105 public T GetCell<T>(int columnIndex, int rowIndex) { 106 return dataColumns[columnIndex].TypeSwitch<T>( 107 c => c[rowIndex], 108 c => c[rowIndex], 109 c => c[rowIndex]); 106 return (T)variableValues[columnIndex][rowIndex]; 110 107 } 111 108 … … 118 115 InsertColumn<T>(i.ToString(), i); 119 116 120 dataColumns[columnIndex].TypeSwitch<T>(value, 121 (c, v) => c[rowIndex] = v, 122 (c, v) => c[rowIndex] = v, 123 (c, v) => c[rowIndex] = v); 124 117 variableValues[columnIndex][rowIndex] = value; 125 118 if (!IsInTransaction) 126 119 OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); … … 128 121 129 122 public string GetCellAsString(int columnIndex, int rowIndex) { 130 return dataColumns[columnIndex].GetValue(rowIndex); 131 } 132 133 public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) { 134 return dataColumns[columnIndex].TypeSwitch<T>( 135 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 136 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 137 c => c.GetValues(considerSelection ? selection[columnIndex] : null)); 138 } 139 140 public void SetValues<T>(int columnIndex, IEnumerable<T> values) { 123 return variableValues[columnIndex][rowIndex].ToString(); 124 } 125 126 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 127 if (considerSelection) { 128 var list = new List<T>(); 129 foreach (var rowIdx in selection[columnIndex]) { 130 list.Add((T)variableValues[columnIndex][rowIdx]); 131 } 132 return list; 133 } else { 134 return (IList<T>)variableValues[columnIndex]; 135 } 136 } 137 138 public void SetValues<T>(int columnIndex, IList<T> values) { 141 139 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 142 140 if (VariableHasType<T>(columnIndex)) { 143 var name = dataColumns[columnIndex].Name; 144 if (dataColumns[columnIndex].IsType<double>()) { 145 dataColumns[columnIndex] = new DoublePreprocessingDataColumn(name, (IEnumerable<double>)values); 146 } else if (dataColumns[columnIndex].IsType<string>()) { 147 dataColumns[columnIndex] = new StringPreprocessingDataColumn(name, (IEnumerable<string>)values); 148 } else if (dataColumns[columnIndex].IsType<DateTime>()) { 149 dataColumns[columnIndex] = new DateTimePreprocessingDataColumn(name, (IEnumerable<DateTime>)values); 141 variableValues[columnIndex] = (IList)values; 142 } else { 143 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name); 144 } 145 if (!IsInTransaction) 146 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 147 } 148 149 public bool SetValue(string value, int columnIndex, int rowIndex) { 150 bool valid = false; 151 if (VariableHasType<double>(columnIndex)) { 152 double val; 153 if (string.IsNullOrWhiteSpace(value)) { 154 val = double.NaN; 155 valid = true; 150 156 } else { 151 throw new ArgumentException("Unknown column type"); 152 } 157 valid = double.TryParse(value, out val); 158 } 159 if (valid) 160 SetCell(columnIndex, rowIndex, val); 161 } else if (VariableHasType<string>(columnIndex)) { 162 valid = value != null; 163 if (valid) 164 SetCell(columnIndex, rowIndex, value); 165 } else if (VariableHasType<DateTime>(columnIndex)) { 166 DateTime date; 167 valid = DateTime.TryParse(value, out date); 168 if (valid) 169 SetCell(columnIndex, rowIndex, date); 153 170 } else { 154 throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + dataColumns[columnIndex].GetType().Name + " but was " + typeof(T).Name); 155 } 171 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 172 } 173 156 174 if (!IsInTransaction) 157 175 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 158 } 159 160 public bool SetValue(string value, int columnIndex, int rowIndex) { 161 var column = dataColumns[columnIndex]; 162 bool successful = column.SetValue(value, rowIndex); 163 164 if (!IsInTransaction) 165 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 166 167 return successful; 176 177 return valid; 168 178 } 169 179 170 180 public int Columns { 171 get { return dataColumns.Count; }181 get { return variableNames.Count; } 172 182 } 173 183 174 184 public int Rows { 175 get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; } 185 get { return variableValues.Count > 0 ? variableValues[0].Count : 0; } 186 } 187 188 public static bool IsMissingValue(object value) { 189 if (value is double) return double.IsNaN((double)value); 190 if (value is string) return string.IsNullOrEmpty((string)value); 191 if (value is DateTime) return ((DateTime)value).Equals(DateTime.MinValue); 192 throw new ArgumentException(); 176 193 } 177 194 #endregion … … 180 197 public void InsertRow(int rowIndex) { 181 198 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 182 183 foreach (var column in dataColumns) { 184 column.TypeSwitch( 185 c => c.Values.Insert(rowIndex, double.NaN), 186 c => c.Values.Insert(rowIndex, null), 187 c => c.Values.Insert(rowIndex, DateTime.MinValue)); 188 } 189 199 foreach (IList column in variableValues) { 200 Type type = column.GetType().GetGenericArguments()[0]; 201 column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null); 202 } 190 203 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 191 204 TrainingPartition.End++; … … 201 214 } 202 215 } 203 204 216 if (!IsInTransaction) 205 217 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 206 218 } 207 208 219 public void DeleteRow(int rowIndex) { 209 DeleteRows(new[] { rowIndex }); 210 } 211 public void DeleteRows(IEnumerable<int> rowIndices) { 220 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 221 foreach (IList column in variableValues) { 222 column.RemoveAt(rowIndex); 223 } 224 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 225 TrainingPartition.End--; 226 if (TrainingPartition.End <= TestPartition.Start) { 227 TestPartition.Start--; 228 TestPartition.End--; 229 } 230 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 231 TestPartition.End--; 232 if (TestPartition.End <= TrainingPartition.Start) { 233 TestPartition.Start--; 234 TestPartition.End--; 235 } 236 } 237 if (!IsInTransaction) 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 239 } 240 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 212 241 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 213 214 foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) { 215 foreach (var column in dataColumns) { 216 column.TypeSwitch( 217 c => c.Values.RemoveAt(rowIndex), 218 c => c.Values.RemoveAt(rowIndex), 219 c => c.Values.RemoveAt(rowIndex)); 220 } 221 242 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 243 foreach (IList column in variableValues) { 244 column.RemoveAt(rowIndex); 245 } 222 246 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 223 247 TrainingPartition.End--; … … 234 258 } 235 259 } 236 237 260 if (!IsInTransaction) 238 261 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); … … 241 264 public void InsertColumn<T>(string variableName, int columnIndex) { 242 265 SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); 243 244 if (typeof(T) == typeof(double)) { 245 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows))); 246 } else if (typeof(T) == typeof(string)) { 247 dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows))); 248 } else if (typeof(T) == typeof(DateTime)) { 249 dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows))); 250 } else { 251 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 252 } 253 266 variableValues.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows))); 267 variableNames.Insert(columnIndex, variableName); 254 268 if (!IsInTransaction) 255 269 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); … … 258 272 public void DeleteColumn(int columnIndex) { 259 273 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 260 261 dataColumns.RemoveAt(columnIndex); 262 274 variableValues.RemoveAt(columnIndex); 275 variableNames.RemoveAt(columnIndex); 263 276 if (!IsInTransaction) 264 277 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); … … 266 279 267 280 public void RenameColumn(int columnIndex, string name) { 268 if (columnIndex < 0 || columnIndex > dataColumns.Count) 281 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 282 if (columnIndex < 0 || columnIndex > variableNames.Count) 269 283 throw new ArgumentOutOfRangeException("columnIndex"); 270 271 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 272 273 dataColumns[columnIndex].Name = name; 284 variableNames[columnIndex] = name; 274 285 275 286 if (!IsInTransaction) … … 279 290 public void RenameColumns(IList<string> names) { 280 291 if (names == null) throw new ArgumentNullException("names"); 281 if (names.Count != dataColumns.Count) throw new ArgumentException("number of names must match the number of columns.", "names");292 if (names.Count != variableNames.Count) throw new ArgumentException("number of names must match the number of columns.", "names"); 282 293 283 294 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 284 285 295 for (int i = 0; i < names.Count; i++) 286 dataColumns[i].Name= names[i];296 variableNames[i] = names[i]; 287 297 288 298 if (!IsInTransaction) … … 291 301 292 302 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 293 return columnIndices.All( VariableHasType<string>);303 return columnIndices.All(x => VariableHasType<string>(x)); 294 304 } 295 305 #endregion … … 297 307 #region Variables 298 308 public IEnumerable<string> VariableNames { 299 get { return dataColumns.Select(c => c.Name); }309 get { return variableNames; } 300 310 } 301 311 302 312 public IEnumerable<string> GetDoubleVariableNames() { 303 return dataColumns.OfType<DoublePreprocessingDataColumn>().Select(c => c.Name); 313 var doubleVariableNames = new List<string>(); 314 for (int i = 0; i < Columns; ++i) { 315 if (VariableHasType<double>(i)) { 316 doubleVariableNames.Add(variableNames[i]); 317 } 318 } 319 return doubleVariableNames; 304 320 } 305 321 306 322 public string GetVariableName(int columnIndex) { 307 return dataColumns[columnIndex].Name;323 return variableNames[columnIndex]; 308 324 } 309 325 310 326 public int GetColumnIndex(string variableName) { 311 return dataColumns.FindIndex(c => c.Name ==variableName);327 return variableNames.IndexOf(variableName); 312 328 } 313 329 314 330 public bool VariableHasType<T>(int columnIndex) { 315 return dataColumns[columnIndex].IsType<T>();331 return columnIndex >= variableValues.Count || variableValues[columnIndex] is List<T>; 316 332 } 317 333 318 334 public Type GetVariableType(int columnIndex) { 319 return dataColumns[columnIndex].GetValueType(); 335 var listType = variableValues[columnIndex].GetType(); 336 return listType.GenericTypeArguments.Single(); 320 337 } 321 338 … … 375 392 #region Import & Export 376 393 public void Import(IDataAnalysisProblemData problemData) { 377 var dataset = problemData.Dataset; 394 Dataset dataset = (Dataset)problemData.Dataset; 395 variableNames = new List<string>(problemData.Dataset.VariableNames); 378 396 InputVariables = new List<string>(problemData.AllowedInputVariables); 379 TargetVariable = problemData is IRegressionProblemData ? ((IRegressionProblemData)problemData).TargetVariable 380 : problemData is IClassificationProblemData ? ((IClassificationProblemData)problemData).TargetVariable 381 : null; 382 383 dataColumns.Clear(); 397 TargetVariable = (problemData is IRegressionProblemData) ? ((IRegressionProblemData)problemData).TargetVariable 398 : (problemData is IClassificationProblemData) ? ((IClassificationProblemData)problemData).TargetVariable 399 : null; 400 401 int columnIndex = 0; 402 variableValues = new List<IList>(); 384 403 foreach (var variableName in problemData.Dataset.VariableNames) { 385 404 if (dataset.VariableHasType<double>(variableName)) { 386 dataColumns.Add(new DoublePreprocessingDataColumn(variableName, dataset.GetDoubleValues(variableName)));405 variableValues.Insert(columnIndex, dataset.GetDoubleValues(variableName).ToList()); 387 406 } else if (dataset.VariableHasType<string>(variableName)) { 388 dataColumns.Add(new StringPreprocessingDataColumn(variableName, dataset.GetStringValues(variableName)));407 variableValues.Insert(columnIndex, dataset.GetStringValues(variableName).ToList()); 389 408 } else if (dataset.VariableHasType<DateTime>(variableName)) { 390 dataColumns.Add(new DateTimePreprocessingDataColumn(variableName, dataset.GetDateTimeValues(variableName)));409 variableValues.Insert(columnIndex, dataset.GetDateTimeValues(variableName).ToList()); 391 410 } else { 392 411 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 393 412 } 413 ++columnIndex; 394 414 } 395 415 … … 401 421 IList<IList> values = new List<IList>(); 402 422 403 for (int i = 0; i < Columns; i++) { 404 var doubleColumn = dataColumns[i] as DoublePreprocessingDataColumn; 405 var stringColumn = dataColumns[i] as StringPreprocessingDataColumn; 406 var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn; 407 if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues())); 408 else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues())); 409 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues())); 410 else throw new InvalidOperationException("Column type not supported for export"); 411 } 412 413 return new Dataset(VariableNames, values); 423 for (int i = 0; i < Columns; ++i) { 424 values.Add(variableValues[i]); 425 } 426 427 var dataset = new Dataset(variableNames, values); 428 return dataset; 414 429 } 415 430 #endregion … … 437 452 438 453 #region Transactions 439 // S napshot/History are not storable/cloneable on purpose454 // Stapshot/History are nost storable/cloneable on purpose 440 455 private class Snapshot { 441 public List<PreprocessingDataColumn> DataColumns { get; set; } 456 public IList<IList> VariableValues { get; set; } 457 public IList<string> VariableNames { get; set; } 442 458 443 459 public IntRange TrainingPartition { get; set; } … … 456 472 } 457 473 458 private const int M axUndoDepth= 5;474 private const int MAX_UNDO_DEPTH = 5; 459 475 460 476 private readonly IList<Snapshot> undoHistory = new List<Snapshot>(); … … 466 482 if (IsInTransaction) return; 467 483 468 var cloner = new Cloner();469 484 var currentSnapshot = new Snapshot { 470 DataColumns = new List<PreprocessingDataColumn>(dataColumns.Select(cloner.Clone)), 485 VariableValues = CopyVariableValues(variableValues), 486 VariableNames = new List<string>(variableNames), 471 487 TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End), 472 488 TestPartition = new IntRange(TestPartition.Start, TestPartition.End), … … 477 493 }; 478 494 479 if (undoHistory.Count >= M axUndoDepth)495 if (undoHistory.Count >= MAX_UNDO_DEPTH) 480 496 undoHistory.RemoveAt(0); 481 497 … … 490 506 if (IsUndoAvailable) { 491 507 Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1]; 492 dataColumns = previousSnapshot.DataColumns; 508 variableValues = previousSnapshot.VariableValues; 509 variableNames = previousSnapshot.VariableNames; 493 510 TrainingPartition = previousSnapshot.TrainingPartition; 494 511 TestPartition = previousSnapshot.TestPartition; … … 521 538 #endregion 522 539 523 /* #region Statistics 524 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 525 try { 526 return dataColumns[columnIndex].TypeSwitch<T>( 527 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 528 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 529 col => col.GetMin(considerSelection ? Selection[columnIndex] : null)); 530 } catch (InvalidOperationException) { 531 return emptyValue; 532 } 533 } 534 535 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 536 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 537 return values.Any() ? values.Max() : emptyValue; 538 } 539 540 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 541 return 542 543 544 if (typeof(T) == typeof(double)) { 545 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 546 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 547 } 548 if (typeof(T) == typeof(string)) { 549 return Convert<T>(string.Empty); 550 } 551 if (typeof(T) == typeof(DateTime)) { 552 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 553 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 554 } 555 556 throw new InvalidOperationException(typeof(T) + " not supported"); 557 } 558 559 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 560 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 561 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 562 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 563 } 564 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 565 return values.Any() ? values.Quantile(0.5) : emptyValue; 566 } 567 568 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 569 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 570 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 571 } 572 573 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 574 if (typeof(T) == typeof(double)) { 575 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 576 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 577 } 578 // For DateTime, std.dev / variance would have to be TimeSpan 579 //if (typeof(T) == typeof(DateTime)) { 580 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 581 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 582 //} 583 return default(T); 584 } 585 586 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 587 if (typeof(T) == typeof(double)) { 588 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 589 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 590 } 591 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 592 //if (typeof(T) == typeof(DateTime)) { 593 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 594 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 595 //} 596 return default(T); 597 } 598 599 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 600 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 601 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 602 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 603 } 604 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 605 return values.Any() ? values.Quantile(alpha) : emptyValue; 606 } 607 608 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 609 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 610 return values.GroupBy(x => x).Count(); 611 } 612 613 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 614 return GetValues<T>(columnIndex, considerSelection).Where(x => 615 ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x, 616 (c, v) => c.IsValidValue(v), 617 (c, v) => c.IsValidValue(v), 618 (c, v) => c.IsValidValue(v) 619 )); 620 } 621 622 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 623 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 624 } 625 626 public int GetMissingValueCount() { 627 int count = 0; 628 for (int i = 0; i < Columns; ++i) { 629 count += GetMissingValueCount(i); 630 } 631 return count; 632 } 633 public int GetMissingValueCount(int columnIndex) { 634 int sum = 0; 635 for (int i = 0; i < Rows; i++) { 636 if (IsCellEmpty(columnIndex, i)) 637 sum++; 638 } 639 return sum; 640 } 641 public int GetRowMissingValueCount(int rowIndex) { 642 int sum = 0; 643 for (int i = 0; i < Columns; i++) { 644 if (IsCellEmpty(i, rowIndex)) 645 sum++; 646 } 647 return sum; 648 } 649 #endregion */ 540 #region Statistics 541 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 542 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 543 return values.Any() ? values.Min() : emptyValue; 544 } 545 546 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 547 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 548 return values.Any() ? values.Max() : emptyValue; 549 } 550 551 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 552 if (typeof(T) == typeof(double)) { 553 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 554 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 555 } 556 if (typeof(T) == typeof(string)) { 557 return Convert<T>(string.Empty); 558 } 559 if (typeof(T) == typeof(DateTime)) { 560 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 561 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 562 } 563 564 throw new InvalidOperationException(typeof(T) + " not supported"); 565 } 566 567 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 568 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 569 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 570 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 571 } 572 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 573 return values.Any() ? values.Quantile(0.5) : emptyValue; 574 } 575 576 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 577 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 578 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 579 } 580 581 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 582 if (typeof(T) == typeof(double)) { 583 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 584 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 585 } 586 // For DateTime, std.dev / variance would have to be TimeSpan 587 //if (typeof(T) == typeof(DateTime)) { 588 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 589 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 590 //} 591 return default(T); 592 } 593 594 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 595 if (typeof(T) == typeof(double)) { 596 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 597 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 598 } 599 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 600 //if (typeof(T) == typeof(DateTime)) { 601 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 602 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 603 //} 604 return default(T); 605 } 606 607 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 608 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 609 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 610 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 611 } 612 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 613 return values.Any() ? values.Quantile(alpha) : emptyValue; 614 } 615 616 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 617 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 618 return values.GroupBy(x => x).Count(); 619 } 620 621 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 622 return GetValues<T>(columnIndex, considerSelection).Where(x => !IsMissingValue(x)); 623 } 624 625 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 626 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 627 } 628 private static T Convert<T>(object obj) { return (T)obj; } 629 630 public int GetMissingValueCount() { 631 int count = 0; 632 for (int i = 0; i < Columns; ++i) { 633 count += GetMissingValueCount(i); 634 } 635 return count; 636 } 637 public int GetMissingValueCount(int columnIndex) { 638 int sum = 0; 639 for (int i = 0; i < Rows; i++) { 640 if (IsCellEmpty(columnIndex, i)) 641 sum++; 642 } 643 return sum; 644 } 645 public int GetRowMissingValueCount(int rowIndex) { 646 int sum = 0; 647 for (int i = 0; i < Columns; i++) { 648 if (IsCellEmpty(i, rowIndex)) 649 sum++; 650 } 651 return sum; 652 } 653 #endregion 654 655 #region Helpers 656 private static IList<IList> CopyVariableValues(IList<IList> original) { 657 var copy = new List<IList>(original); 658 for (int i = 0; i < original.Count; ++i) { 659 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 660 } 661 return copy; 662 } 663 #endregion 650 664 } 651 665 -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/HeuristicLab.DataPreprocessing-3.4.csproj
r15291 r15431 123 123 <Compile Include="Content\ScatterPlotContent.cs" /> 124 124 <Compile Include="Content\DataCompletenessChartContent.cs" /> 125 <Compile Include="Data\Columns\DateTimePreprocessingDataColumn.cs" />126 <Compile Include="Data\Columns\DoublePreprocessingDataColumn.cs" />127 <Compile Include="Data\Columns\PreprocessingDataColumn.cs" />128 <Compile Include="Data\Columns\StringPreprocessingDataColumn.cs" />129 125 <Compile Include="Data\FilteredPreprocessingData.cs" /> 130 126 <Compile Include="Content\ManipulationContent.cs" /> -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/PreprocessingTransformator.cs
r15309 r15431 83 83 int colIndex = preprocessingData.GetColumnIndex(transformation.Column); 84 84 var originalData = preprocessingData.GetValues<double>(colIndex); 85 originalColumns.Add(transformation.Column, originalData .ToList());85 originalColumns.Add(transformation.Column, originalData); 86 86 } 87 87 } … … 107 107 } 108 108 109 private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, I Enumerable<double> data, out bool success, out string errorMsg) {109 private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IList<double> data, out bool success, out string errorMsg) { 110 110 success = transformation.Check(data, out errorMsg); 111 111 // don't apply when the check fails -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/ProblemDataCreator.cs
r15309 r15431 129 129 } 130 130 131 private bool IsNotConstantInputVariable(I Enumerable<double> list) {131 private bool IsNotConstantInputVariable(IList<double> list) { 132 132 return context.Data.TrainingPartition.End - context.Data.TrainingPartition.Start > 1 || list.Range() > 0; 133 133 }
Note: See TracChangeset
for help on using the changeset viewer.