Changeset 15309
- Timestamp:
- 08/07/17 09:43:58 (7 years ago)
- Location:
- branches/DataPreprocessing Cleanup
- Files:
-
- 14 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing.Views/3.4/StatisticsView.cs
r15283 r15309 92 92 numericColumnsTextBox.Text = GetColumnCount<double>().ToString(); 93 93 nominalColumnsTextBox5.Text = GetColumnCount<string>().ToString(); 94 missingValuesTextBox.Text = data. GetMissingValueCount().ToString();95 totalValuesTextBox.Text = (data.Rows * data.Rows - data. GetMissingValueCount()).ToString();94 missingValuesTextBox.Text = data.DataColumns.Sum(c => c.GetNumberOfMissingValues()).ToString(); 95 totalValuesTextBox.Text = (data.Rows * data.Rows - data.DataColumns.Sum(c => c.GetNumberOfMissingValues())).ToString(); 96 96 97 97 var variableNames = Content.PreprocessingData.VariableNames.ToList(); … … 111 111 for (int j = 0; j < statistics.Count; j++) { 112 112 if (horizontal) 113 statisticsMatrix[j, i] = statistics[j] ;113 statisticsMatrix[j, i] = statistics[j].ToString(); 114 114 else 115 statisticsMatrix[i, j] = statistics[j] ;115 statisticsMatrix[i, j] = statistics[j].ToString(); 116 116 } 117 117 } … … 148 148 } 149 149 150 private List<string>GetStatistics(int varIdx) {151 List<string>list;150 private IList GetStatistics(int varIdx) { 151 IList list; 152 152 var data = Content.PreprocessingData; 153 153 if (data.VariableHasType<double>(varIdx)) { … … 166 166 } 167 167 168 private List<string>GetDoubleColumns(int statIdx) {169 var data = Content.PreprocessingData;170 return new List< string> {171 data.GetVariableType(statIdx).Name,172 data.GetMissingValueCount(statIdx).ToString(),173 data.GetMin<double>(statIdx, emptyValue: double.NaN).ToString(),174 data.GetMax<double>(statIdx, emptyValue: double.NaN).ToString(),175 data.GetMedian<double>(statIdx, emptyValue: double.NaN).ToString(),176 data.GetMean<double>(statIdx, emptyValue: double.NaN).ToString(),177 data.GetStandardDeviation<double>(statIdx, emptyValue: double.NaN).ToString(),178 data.GetVariance<double>(statIdx, emptyValue: double.NaN).ToString(),179 data.GetQuantile<double>(0.25, statIdx, emptyValue: double.NaN).ToString(),180 data.GetQuantile<double>(0.75, statIdx, emptyValue: 
double.NaN).ToString(),181 data.GetMode<double>(statIdx, emptyValue: double.NaN).ToString(),182 data.GetDistinctValues<double>(statIdx).ToString()168 private IList GetDoubleColumns(int statIdx) { 169 var column = (DoublePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx]; 170 return new List<object> { 171 column.GetValueType().Name, 172 column.GetNumberOfMissingValues(), 173 column.GetMin(), 174 column.GetMax(), 175 column.GetMedian(), 176 column.GetMean(), 177 column.GetStandardDeviation(), 178 column.GetVariance(), 179 column.GetQuantile(0.25), 180 column.GetQuantile(0.75), 181 column.GetMode(), 182 column.GetDistinctValues() 183 183 }; 184 184 } 185 185 186 private List<string>GetStringColumns(int statIdx) {187 var data = Content.PreprocessingData;188 return new List< string> {189 data.GetVariableType(statIdx).Name,190 data.GetMissingValueCount(statIdx).ToString(),191 "", // data.GetMin<string>(statIdx, emptyValue: string.Empty), //min192 "", // data.GetMax<string>(statIdx, emptyValue: string.Empty), //max193 "", // data.GetMedian<string>(statIdx, emptyValue: string.Empty), //median186 private IList GetStringColumns(int statIdx) { 187 var column = (StringPreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx]; 188 return new List<object> { 189 column.GetValueType().Name, 190 column.GetNumberOfMissingValues(), 191 "", //min 192 "", //max 193 "", //median 194 194 "", //average 195 195 "", //standard deviation 196 196 "", //variance 197 "", // data.GetQuantile<string>(0.25, statIdx, emptyValue: string.Empty), //quarter percentile198 "", // data.GetQuantile<string>(0.75, statIdx, emptyValue: string.Empty), //three quarter percentile199 data.GetMode<string>(statIdx, emptyValue: string.Empty),200 data.GetDistinctValues<string>(statIdx).ToString()197 "", //quarter percentile 198 "", //three quarter percentile 199 column.GetMode(), 200 column.GetDistinctValues() 201 201 }; 202 202 } 203 203 204 private List<string>GetDateTimeColumns(int 
statIdx) {205 var data = Content.PreprocessingData;206 return new List< string> {207 data.GetVariableType(statIdx).Name,208 data.GetMissingValueCount(statIdx).ToString(),209 data.GetMin<DateTime>(statIdx).ToString(),210 data.GetMax<DateTime>(statIdx).ToString(),211 data.GetMedian<DateTime>(statIdx).ToString(),212 data.GetMean<DateTime>(statIdx).ToString(),213 "", // should be of type TimeSpan //data.GetStandardDeviation<DateTime>(statIdx).ToString(),214 "", // should be of type TimeSpan //data.GetVariance<DateTime>(statIdx).ToString(),215 data.GetQuantile<DateTime>(0.25, statIdx).ToString(),216 data.GetQuantile<DateTime>(0.75, statIdx).ToString(),217 data.GetMode<DateTime>(statIdx).ToString(),218 data.GetDistinctValues<DateTime>(statIdx).ToString()204 private IList GetDateTimeColumns(int statIdx) { 205 var column = (DateTimePreprocessingDataColumn)Content.PreprocessingData.DataColumns[statIdx]; 206 return new List<object> { 207 column.GetValueType().Name, 208 column.GetNumberOfMissingValues(), 209 column.GetMin(), 210 column.GetMax(), 211 column.GetMedian(), 212 column.GetMean(), 213 column.GetStandardDeviation(), 214 /*column.GetVariance()*/"", // variance (in ticks) is usually to high to display a valid TimeSpan or DateTime 215 column.GetQuantile(0.25), 216 column.GetQuantile(0.75), 217 column.GetMode(), 218 column.GetDistinctValues() 219 219 }; 220 220 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/DataGridContent.cs
r15285 r15309 90 90 91 91 public void DeleteRows(IEnumerable<int> rows) { 92 PreprocessingData.DeleteRows WithIndices(rows);92 PreprocessingData.DeleteRows(rows); 93 93 } 94 94 … … 134 134 135 135 #region Manipulations 136 private void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, Func<int, double> doubleAggregator = null,137 Func<int, DateTime> dateTimeAggregator = null, Func<int, string> stringAggregator = null) {138 PreprocessingData.InTransaction(() => {139 foreach (var column in cells) {140 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {141 var value = doubleAggregator(column.Key);142 foreach (int index in column.Value)143 PreprocessingData.SetCell<double>(column.Key, index, value);144 } else if (dateTimeAggregator != null && PreprocessingData.VariableHasType<DateTime>(column.Key)) {145 var value = dateTimeAggregator(column.Key);146 foreach (int index in column.Value)147 PreprocessingData.SetCell<DateTime>(column.Key, index, value);148 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {149 var value = stringAggregator(column.Key);150 foreach (int index in column.Value)151 PreprocessingData.SetCell<string>(column.Key, index, value);152 }153 }154 });155 }156 157 private void ReplaceIndicesByValues(IDictionary<int, IList<int>> cells, Func<int, IEnumerable<double>> doubleAggregator = null,158 Func<int, IEnumerable<DateTime>> dateTimeAggregator = null, Func<int, IEnumerable<string>> stringAggregator = null) {159 PreprocessingData.InTransaction(() => {160 foreach (var column in cells) {161 if (doubleAggregator != null && PreprocessingData.VariableHasType<double>(column.Key)) {162 var values = doubleAggregator(column.Key);163 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))164 PreprocessingData.SetCell<double>(column.Key, pair.row, pair.value);165 } else if (dateTimeAggregator != null && 
PreprocessingData.VariableHasType<DateTime>(column.Key)) {166 var values = dateTimeAggregator(column.Key);167 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))168 PreprocessingData.SetCell<DateTime>(column.Key, pair.row, pair.value);169 } else if (stringAggregator != null && PreprocessingData.VariableHasType<string>(column.Key)) {170 var values = stringAggregator(column.Key);171 foreach (var pair in column.Value.Zip(values, (row, value) => new { row, value }))172 PreprocessingData.SetCell<string>(column.Key, pair.row, pair.value);173 }174 }175 });176 }177 178 136 public void ReplaceIndicesByMean(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 179 ReplaceIndicesByValue(cells, 180 col => PreprocessingData.GetMean<double>(col, considerSelection), 181 col => PreprocessingData.GetMean<DateTime>(col, considerSelection)); 137 PreprocessingData.InTransaction(() => { 138 foreach (var column in cells) { 139 PreprocessingData.DataColumns[column.Key].TypeSwitch( 140 c => { 141 var mean = c.GetMean(considerSelection ? column.Value : null); 142 foreach (var index in column.Value) c[index] = mean; 143 }, 144 dateTimeAction: c => { 145 var mean = c.GetMean(considerSelection ? column.Value : null); 146 foreach (var index in column.Value) c[index] = mean; 147 }); 148 } 149 }); 182 150 } 183 151 184 152 public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 185 ReplaceIndicesByValue(cells, 186 col => PreprocessingData.GetMedian<double>(col, considerSelection), 187 col => PreprocessingData.GetMedian<DateTime>(col, considerSelection)); 153 PreprocessingData.InTransaction(() => { 154 foreach (var column in cells) { 155 PreprocessingData.DataColumns[column.Key].TypeSwitch( 156 c => { 157 var median = c.GetMedian(considerSelection ? column.Value : null); 158 foreach (var index in column.Value) c[index] = median; 159 }, 160 c => { 161 var median = c.GetMedian(considerSelection ? 
column.Value : null); 162 foreach (var index in column.Value) c[index] = median; 163 }, 164 c => { 165 var median = c.GetMedian(considerSelection ? column.Value : null); 166 foreach (var index in column.Value) c[index] = median; 167 }); 168 } 169 }); 188 170 } 189 171 190 172 public void ReplaceIndicesByMode(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 191 ReplaceIndicesByValue(cells, 192 col => PreprocessingData.GetMode<double>(col, considerSelection), 193 col => PreprocessingData.GetMode<DateTime>(col, considerSelection), 194 col => PreprocessingData.GetMode<string>(col, considerSelection)); 173 PreprocessingData.InTransaction(() => { 174 foreach (var column in cells) { 175 PreprocessingData.DataColumns[column.Key].TypeSwitch( 176 c => { 177 var mode = c.GetMode(considerSelection ? column.Value : null); 178 foreach (var index in column.Value) c[index] = mode; 179 }, 180 c => { 181 var mode = c.GetMode(considerSelection ? column.Value : null); 182 foreach (var index in column.Value) c[index] = mode; 183 }, 184 c => { 185 var mode = c.GetMode(considerSelection ? 
column.Value : null); 186 foreach (var index in column.Value) c[index] = mode; 187 }); 188 } 189 }); 195 190 } 196 191 197 192 public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) { 198 193 var rand = new FastRandom(); 199 ReplaceIndicesByValues(cells, 200 col => { 201 double min = PreprocessingData.GetMin<double>(col, considerSelection); 202 double max = PreprocessingData.GetMax<double>(col, considerSelection); 203 double range = max - min; 204 return cells[col].Select(_ => rand.NextDouble() * range + min); 205 }, 206 col => { 207 var min = PreprocessingData.GetMin<DateTime>(col, considerSelection); 208 var max = PreprocessingData.GetMax<DateTime>(col, considerSelection); 209 double range = (max - min).TotalSeconds; 210 return cells[col].Select(_ => min + TimeSpan.FromSeconds(rand.NextDouble() * range)); 211 }); 194 PreprocessingData.InTransaction(() => { 195 foreach (var column in cells) { 196 PreprocessingData.DataColumns[column.Key].TypeSwitch( 197 c => { 198 double min = c.GetMin(considerSelection ? column.Value : null); 199 double max = c.GetMax(considerSelection ? column.Value : null); 200 double range = max - min; 201 foreach (var index in column.Value) c[index] = min + rand.NextDouble() * range; 202 }, 203 dateTimeAction: c => { 204 var min = c.GetMin(considerSelection ? column.Value : null); 205 var max = c.GetMax(considerSelection ? column.Value : null); 206 double range = (max - min).TotalSeconds; 207 foreach (var index in column.Value) c[index] = min + TimeSpan.FromSeconds(rand.NextDouble() * range); 208 }); 209 } 210 }); 212 211 } 213 212 … … 216 215 foreach (var column in cells) { 217 216 foreach (var rowIdx in column.Value) { 218 PreprocessingData. 
SetValue(value, column.Key, rowIdx);217 PreprocessingData.DataColumns[column.Key].SetValue(value, rowIdx); 219 218 } 220 219 } … … 257 256 int valuesToInterpolate = nextIndex - prevIndex; 258 257 259 if (PreprocessingData.VariableHasType<double>(column.Key)) { 260 double prev = PreprocessingData.GetCell<double>(column.Key, prevIndex); 261 double next = PreprocessingData.GetCell<double>(column.Key, nextIndex); 262 double interpolationStep = (next - prev) / valuesToInterpolate; 263 264 for (int i = prevIndex; i < nextIndex; ++i) { 265 double interpolated = prev + (interpolationStep * (i - prevIndex)); 266 PreprocessingData.SetCell<double>(column.Key, i, interpolated); 267 } 268 } else if (PreprocessingData.VariableHasType<DateTime>(column.Key)) { 269 DateTime prev = PreprocessingData.GetCell<DateTime>(column.Key, prevIndex); 270 DateTime next = PreprocessingData.GetCell<DateTime>(column.Key, nextIndex); 271 double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate; 272 273 for (int i = prevIndex; i < nextIndex; ++i) { 274 DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex)); 275 PreprocessingData.SetCell<DateTime>(column.Key, i, interpolated); 276 } 277 } 258 PreprocessingData.DataColumns[column.Key].TypeSwitch( 259 c => { 260 double prev = c[prevIndex]; 261 double next = c[nextIndex]; 262 double interpolationStep = (next - prev) / valuesToInterpolate; 263 for (int i = prevIndex; i < nextIndex; i++) c[i] = prev + (interpolationStep * (i - prevIndex)); 264 }, 265 dateTimeAction: c => { 266 var prev = c[prevIndex]; 267 var next = c[nextIndex]; 268 double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate; 269 for (int i = prevIndex; i < nextIndex; i++) c[i] = prev.AddSeconds(interpolationStep * (i - prevIndex)); 270 } 271 ); 278 272 } 279 273 280 274 private int IndexOfPrevPresentValue(int columnIndex, int start) { 281 int offset = start - 1; 282 while (offset >= 0 && PreprocessingData.IsCellEmpty(columnIndex, 
offset)) { 283 offset--; 284 } 285 286 return offset; 275 int index = start - 1; 276 while (index >= 0 && PreprocessingData.IsCellEmpty(columnIndex, index)) 277 index--; 278 return index; 287 279 } 288 280 289 281 private int IndexOfNextPresentValue(int columnIndex, int start) { 290 int offset = start + 1; 291 while (offset < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, offset)) { 292 offset++; 293 } 294 295 return offset; 282 int index = start + 1; 283 while (index < PreprocessingData.Rows && PreprocessingData.IsCellEmpty(columnIndex, index)) 284 index++; 285 return index; 296 286 } 297 287 … … 303 293 PreprocessingData.InTransaction(() => { 304 294 // process all given ranges - e.g. TrainingPartition, TestPartition 305 foreach ( IntRangerange in ranges) {295 foreach (var range in ranges) { 306 296 var indices = Enumerable.Range(0, PreprocessingData.Rows).ToArray(); 307 297 var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray(); … … 324 314 public void ReOrderToIndices(int[] indices) { 325 315 PreprocessingData.InTransaction(() => { 326 for (int i = 0; i < PreprocessingData.Columns; ++i) {327 if (PreprocessingData.VariableHasType<double>(i))328 ReOrderToIndices<double>(i, indices);329 else if (PreprocessingData.VariableHasType<string>(i))330 ReOrderToIndices<string>(i, indices);331 else if (PreprocessingData.VariableHasType<DateTime>(i))332 ReOrderToIndices<DateTime>(i, indices);333 }334 });335 }336 337 private void ReOrderToIndices<T>(int columnIndex, int[] indices) {338 var originalData = new List<T>(PreprocessingData.GetValues<T>(columnIndex));339 if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");340 341 for (int i = 0; i < indices.Length; i++) {342 T newValue = originalData[indices[i]];343 PreprocessingData.SetCell<T>(columnIndex, i, newValue);344 } 316 foreach (var column in PreprocessingData.DataColumns) { 317 
column.TypeSwitch( 318 c => { 319 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values."); 320 var originalData = new List<double>(c.Values); 321 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]]; 322 }, 323 c => { 324 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values."); 325 var originalData = new List<string>(c.Values); 326 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]]; 327 }, 328 c => { 329 if (indices.Length != c.Values.Count) throw new InvalidOperationException("The number of provided indices does not match the values."); 330 var originalData = new List<DateTime>(c.Values); 331 for (int i = 0; i < indices.Length; i++) c[i] = originalData[indices[i]]; 332 }); 333 } 334 }); 345 335 } 346 336 #endregion -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ManipulationContent.cs
r15285 r15309 57 57 58 58 for (int i = 0; i < PreprocessingData.Rows; ++i) { 59 int missingCount = PreprocessingData.GetRowMissingValueCount(i); 60 if (100f / PreprocessingData.Columns * missingCount > percent) { 59 int missingCount = 0; 60 for (var col = 0; col < PreprocessingData.DataColumns.Count; col++) { 61 if (!PreprocessingData.DataColumns[col].IsValidValue(i)) 62 missingCount++; 63 } 64 if (100f / PreprocessingData.Columns * missingCount > percent) 61 65 rows.Add(i); 62 }63 66 } 64 67 … … 69 72 List<int> columns = new List<int>(); 70 73 for (int i = 0; i < PreprocessingData.Columns; ++i) { 71 int missingCount = PreprocessingData. GetMissingValueCount(i);74 int missingCount = PreprocessingData.DataColumns[i].GetNumberOfMissingValues(); 72 75 if (100f / PreprocessingData.Rows * missingCount > percent) { 73 76 columns.Add(i); … … 80 83 public List<int> ColumnsWithVarianceSmaller(double variance) { 81 84 List<int> columns = new List<int>(); 82 for (int i = 0; i < PreprocessingData.Columns; ++i) { 83 if (PreprocessingData.VariableHasType<double>(i)) { 84 double columnVariance = PreprocessingData.GetVariance<double>(i); 85 if (columnVariance < variance) { 86 columns.Add(i); 87 } 88 } else if (PreprocessingData.VariableHasType<DateTime>(i)) { 89 double columnVariance = (double)PreprocessingData.GetVariance<DateTime>(i).Ticks / TimeSpan.TicksPerSecond; 90 if (columnVariance < variance) { 91 columns.Add(i); 92 } 93 } 85 86 for (int i = 0; i < PreprocessingData.Columns; i++) { 87 if (PreprocessingData.DataColumns[i].TypeSwitch<bool>( 88 c => c.GetVariance() < variance, 89 c => false, 90 c => c.GetVariance().Ticks / TimeSpan.TicksPerSecond < variance 91 )) 92 columns.Add(i); 94 93 } 94 95 95 return columns; 96 96 } … … 119 119 PreprocessingData.InTransaction(() => { 120 120 foreach (int column in columns.OrderByDescending(x => x)) { 121 PreprocessingData.D eleteColumn(column);121 PreprocessingData.DataColumns.RemoveAt(column); 122 122 } 123 123 }); -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/PreprocessingChartContent.cs
r15274 r15309 82 82 83 83 public static DataRow CreateDataRow(IFilteredPreprocessingData preprocessingData, string variableName, DataRowVisualProperties.DataRowChartType chartType) { 84 IList<double>values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName));84 var values = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableName)); 85 85 DataRow row = new DataRow(variableName, "", values); 86 86 row.VisualProperties.ChartType = chartType; -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Content/ScatterPlotContent.cs
r15274 r15309 21 21 22 22 using System; 23 using System.Collections.Generic;24 23 using System.Linq; 25 24 using HeuristicLab.Analysis; … … 51 50 #endregion 52 51 53 public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) { 52 public static ScatterPlot CreateScatterPlot(IFilteredPreprocessingData preprocessingData, string variableNameX, string variableNameY, 53 string variableNameGroup = "-", LegendOrder legendOrder = LegendOrder.Alphabetically) { 54 54 ScatterPlot scatterPlot = new ScatterPlot(); 55 55 56 IList<double>xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX));57 IList<double>yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY));56 var xValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameX)); 57 var yValues = preprocessingData.GetValues<double>(preprocessingData.GetColumnIndex(variableNameY)); 58 58 59 59 var points = xValues.Zip(yValues, (x, y) => new Point2D<double>(x, y)).ToList(); -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/DateTimePreprocessingDataColumn.cs
r15291 r15309 29 29 namespace HeuristicLab.DataPreprocessing { 30 30 [Item("DateTimePreprocessingDataColumn", "")] 31 public class DateTimePreprocessingDataColumn : NullablePreprocessingDataColumn<DateTime, TimeSpan> {31 public class DateTimePreprocessingDataColumn : PreprocessingDataColumn<DateTime> { 32 32 33 33 #region Constructor, Cloning & Persistence … … 35 35 : base() { } 36 36 public DateTimePreprocessingDataColumn(string name, IEnumerable<DateTime> values) 37 : base(name, values) {38 }39 public DateTimePreprocessingDataColumn(string name, IEnumerable<DateTime?> values)40 37 : base(name, values) { 41 38 } … … 53 50 #endregion 54 51 52 protected override DateTime DefaultValue { get { return DateTime.MinValue; } } 53 55 54 #region Statistics 56 public override TimeSpan GetRange() { return ValidValues.Max() - ValidValues.Min(); }57 public override DateTime GetMean() { return AggregateAsDouble(ValidValues, Enumerable.Average); }58 public override TimeSpan GetStandardDeviation() { return AggregateDistanceAsDouble(ValidValues, EnumerableStatisticExtensions.StandardDeviation); }59 public override TimeSpan GetVariance() { return AggregateDistanceAsDouble(ValidValues, EnumerableStatisticExtensions.Variance); }55 public TimeSpan GetRange(IEnumerable<int> indices = null) { return GetMax(indices) - GetMin(indices); } 56 public DateTime GetMean(IEnumerable<int> indices = null) { return AggregateAsDouble(GetValidValues(indices), Enumerable.Average); } 57 public TimeSpan GetStandardDeviation(IEnumerable<int> indices = null) { return AggregateDistanceAsDouble(GetValidValues(indices), EnumerableStatisticExtensions.StandardDeviation); } 58 public TimeSpan GetVariance(IEnumerable<int> indices = null) { return AggregateDistanceAsDouble(GetValidValues(indices), EnumerableStatisticExtensions.Variance); } 60 59 #endregion 61 60 … … 69 68 public override string GetValue(int index) { 70 69 var value = Values[index]; 71 return value.HasValue ? 
value.Value.ToString("o") : string.Empty;70 return IsValidValue(value) ? Values[index].ToString("o") : string.Empty; // format "s" sortable or "o" roundtrip 72 71 } 73 72 public override bool SetValue(string value, int index) { … … 77 76 return true; 78 77 } else if (string.IsNullOrEmpty(value)) { 79 Values[index] = null;78 Values[index] = DateTime.MinValue; 80 79 return true; 81 80 } else { … … 86 85 87 86 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 88 return new DateTime((long)func(values.Select(x => (double)x.Ticks)));87 return values.Any() ? new DateTime((long)func(values.Select(x => (double)x.Ticks))) : DateTime.MinValue; 89 88 } 90 89 private static TimeSpan AggregateDistanceAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 91 return new TimeSpan((long)func(values.Select(x => (double)x.Ticks)));90 return values.Any() ? new TimeSpan((long)func(values.Select(x => (double)x.Ticks))) : TimeSpan.Zero; 92 91 } 93 92 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/DoublePreprocessingDataColumn.cs
r15291 r15309 31 31 [Item("DoublePreprocessingDataColumn", "")] 32 32 [StorableClass] 33 public sealed class DoublePreprocessingDataColumn : NullablePreprocessingDataColumn<double,double> {33 public sealed class DoublePreprocessingDataColumn : PreprocessingDataColumn<double> { 34 34 35 35 #region Constructor, Cloning & Persistence 36 36 37 public DoublePreprocessingDataColumn() 37 38 : base() { } 39 38 40 public DoublePreprocessingDataColumn(string name, IEnumerable<double> values) 39 : base(name, values) { 40 } 41 public DoublePreprocessingDataColumn(string name, IEnumerable<double?> values) 42 : base(name, values) { 43 } 41 : base(name, values) { } 44 42 45 43 private DoublePreprocessingDataColumn(DoublePreprocessingDataColumn original, Cloner cloner) 46 : base(original, cloner) { 47 } 44 : base(original, cloner) { } 45 48 46 public override IDeepCloneable Clone(Cloner cloner) { 49 47 return new DoublePreprocessingDataColumn(this, cloner); … … 53 51 private DoublePreprocessingDataColumn(bool deserializing) 54 52 : base(deserializing) { } 53 55 54 #endregion 56 55 … … 59 58 } 60 59 60 protected override double DefaultValue { get { return double.NaN; } } 61 61 62 #region Statistics 62 public override double GetRange() { return ValidValues.Max() - ValidValues.Min(); }63 public override double GetMean() { return ValidValues.Average(); }64 public override double GetMedian( ) { return ValidValues.Quantile(0.5); } // IEnumerable<doube> version is faster65 public override double GetStandardDeviation() { return ValidValues.StandardDeviation(); }66 public override double GetVariance() { return ValidValues.Variance(); }67 public override double GetQuantile(double alpha ) { return ValidValues.Quantile(alpha); } // IEnumerable<doube> version is faster63 public double GetRange(IEnumerable<int> indices = null) { return GetMax(indices) - GetMin(indices); } 64 public double GetMean(IEnumerable<int> indices = null) { return 
GetValidValues(indices).DefaultIfEmpty(DefaultValue).Average(); } 65 public override double GetMedian(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Median(); } // IEnumerable<doube> version is faster 66 public double GetStandardDeviation(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).StandardDeviation(); } 67 public double GetVariance(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Variance(); } 68 public override double GetQuantile(double alpha, IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(double.NaN).Quantile(alpha); } // IEnumerable<doube> version is faster 68 69 #endregion 69 70 … … 83 84 } 84 85 public override string GetValue(int index) { 85 var value = Values[index]; 86 return value.HasValue ? value.Value.ToString("r") : string.Empty; 86 return Values[index].ToString("r"); 87 87 } 88 88 public override bool SetValue(string value, int index) { … … 92 92 return true; 93 93 } else if (string.IsNullOrEmpty(value)) { 94 Values[index] = null;94 Values[index] = double.NaN; 95 95 return true; 96 96 } else { -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/PreprocessingDataColumn.cs
r15291 r15309 54 54 public abstract bool IsValidValue(int index); 55 55 56 #region Column Type Switches 57 internal void TypeSwitch(Action<DoublePreprocessingDataColumn> doubleAction, Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) { 58 var doubleColumn = this as DoublePreprocessingDataColumn; 59 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn); 60 var stringColumn = this as StringPreprocessingDataColumn; 61 if (stringColumn != null && stringAction != null) stringAction(stringColumn); 62 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 63 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn); 64 } 65 internal void TypeSwitch<TIn>(TIn value, Action<DoublePreprocessingDataColumn, double> doubleAction, Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime> dateTimeAction = null) { 66 var doubleColumn = this as DoublePreprocessingDataColumn; 67 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn, Convert<double>(value)); 68 var stringColumn = this as StringPreprocessingDataColumn; 69 if (stringColumn != null && stringAction != null) stringAction(stringColumn, Convert<string>(value)); 70 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 71 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn, Convert<DateTime>(value)); 72 } 73 74 internal TOut TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, double> doubleFunc, Func<StringPreprocessingDataColumn, string> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime> dateTimeFunc = null) { 75 var doubleColumn = this as DoublePreprocessingDataColumn; 76 if (doubleColumn != null && doubleFunc != null) return Convert<TOut>(doubleFunc(doubleColumn)); 77 var stringColumn = this as StringPreprocessingDataColumn; 78 if (stringColumn != null && 
stringFunc != null) return Convert<TOut>(stringFunc(stringColumn)); 79 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 80 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<TOut>(dateTimeFunc(dateTimeColumn)); 81 throw new InvalidOperationException("Invalid data column type."); 82 } 83 internal TOut TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, TOut> doubleFunc, Func<StringPreprocessingDataColumn, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, TOut> dateTimeFunc = null) { 84 var doubleColumn = this as DoublePreprocessingDataColumn; 85 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn); 86 var stringColumn = this as StringPreprocessingDataColumn; 87 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn); 88 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 89 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn); 90 throw new InvalidOperationException("Invalid data column type."); 91 } 92 internal TOut TypeSwitch<TIn, TOut>(TIn value, Func<DoublePreprocessingDataColumn, double, TOut> doubleFunc, Func<StringPreprocessingDataColumn, string, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime, TOut> dateTimeFunc = null) { 93 var doubleColumn = this as DoublePreprocessingDataColumn; 94 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn, Convert<double>(value)); 95 var stringColumn = this as StringPreprocessingDataColumn; 96 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn, Convert<string>(value)); 97 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 98 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn, Convert<DateTime>(value)); 99 throw new InvalidOperationException("Invalid data column type."); 100 } 101 internal IEnumerable<TOut> TypeSwitch<TOut>(Func<DoublePreprocessingDataColumn, 
IEnumerable<double>> doubleFunc, Func<StringPreprocessingDataColumn, IEnumerable<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, IEnumerable<DateTime>> dateTimeFunc = null) { 102 var doubleColumn = this as DoublePreprocessingDataColumn; 103 if (doubleColumn != null && doubleFunc != null) return Convert<IEnumerable<TOut>>(doubleFunc(doubleColumn)); 104 var stringColumn = this as StringPreprocessingDataColumn; 105 if (stringColumn != null && stringFunc != null) return Convert<IEnumerable<TOut>>(stringFunc(stringColumn)); 106 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 107 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IEnumerable<TOut>>(dateTimeFunc(dateTimeColumn)); 108 throw new InvalidOperationException("Invalid data column type."); 109 } 110 internal IEnumerable<TOut> TypeSwitch<TOut, TIn>(TIn value, Func<DoublePreprocessingDataColumn, double, IEnumerable<double>> doubleFunc, Func<StringPreprocessingDataColumn, string, IEnumerable<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime, IEnumerable<DateTime>> dateTimeFunc = null) { 111 var doubleColumn = this as DoublePreprocessingDataColumn; 112 if (doubleColumn != null && doubleFunc != null) return Convert<IEnumerable<TOut>>(doubleFunc(doubleColumn, Convert<double>(value))); 113 var stringColumn = this as StringPreprocessingDataColumn; 114 if (stringColumn != null && stringFunc != null) return Convert<IEnumerable<TOut>>(stringFunc(stringColumn, Convert<string>(value))); 115 var dateTimeColumn = this as DateTimePreprocessingDataColumn; 116 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IEnumerable<TOut>>(dateTimeFunc(dateTimeColumn, Convert<DateTime>(value))); 117 throw new InvalidOperationException("Invalid data column type."); 118 } 119 120 private static T Convert<T>(object obj) { return (T)obj; } 121 #endregion 122 123 #region Statistics 124 public abstract int GetDistinctValues(IEnumerable<int> indices = null); 
125 public abstract int GetNumberOfMissingValues(IEnumerable<int> indices = null); 126 #endregion 56 127 57 128 #region String Handling … … 64 135 [Item("PreprocessingDataColumn", "")] 65 136 [StorableClass] 66 public abstract class PreprocessingDataColumn<T Value, TDistance> : PreprocessingDataColumn67 where T Value : class, IComparable<TValue> {137 public abstract class PreprocessingDataColumn<T> : PreprocessingDataColumn 138 where T : IComparable<T> { 68 139 69 140 #region Constructor, Cloning & Persistence 70 141 protected PreprocessingDataColumn() 71 : this(string.Empty, Enumerable.Empty<T Value>()) { }72 protected PreprocessingDataColumn(string name, IEnumerable<T Value> values)142 : this(string.Empty, Enumerable.Empty<T>()) { } 143 protected PreprocessingDataColumn(string name, IEnumerable<T> values) 73 144 : base(name) { 74 Values = new List<T Value>(values);75 } 76 77 protected PreprocessingDataColumn(PreprocessingDataColumn<T Value, TDistance> original, Cloner cloner)145 Values = new List<T>(values); 146 } 147 148 protected PreprocessingDataColumn(PreprocessingDataColumn<T> original, Cloner cloner) 78 149 : base(original, cloner) { 79 Values = new List<T Value>(original.Values);150 Values = new List<T>(original.Values); 80 151 } 81 152 … … 86 157 87 158 [Storable] 88 public List<TValue> Values { get; private set; } 89 public IEnumerable<TValue> ValidValues { 90 get { return Values.Where(IsValidValue); } 91 } 159 internal List<T> Values { get; private set; } 160 public IEnumerable<T> GetValues(IEnumerable<int> indices = null) { 161 return indices == null 162 ? Values 163 : indices.Select(index => Values[index]); 164 } 165 public IEnumerable<T> GetValidValues(IEnumerable<int> indices = null) { 166 return indices == null 167 ? 
Values.Where(IsValidValue) 168 : indices.Select(index => Values[index]).Where(IsValidValue); 169 } 170 171 protected abstract T DefaultValue { get; } 92 172 93 173 public override Type GetValueType() { 94 return typeof(T Value);174 return typeof(T); 95 175 } 96 176 … … 99 179 } 100 180 101 public T Valuethis[int index] {181 public T this[int index] { 102 182 get { return Values[index]; } 103 183 set { Values[index] = value; } 104 184 } 105 185 106 public virtual bool IsValidValue(T Valuevalue) { return true; }186 public virtual bool IsValidValue(T value) { return true; } 107 187 public override bool IsValidValue(int index) { 108 188 return IsValidValue(Values[index]); … … 110 190 111 191 #region Statistics 112 public virtual TValue GetMin() { return Values.Min(); } 113 public virtual TValue GetMax() { return Values.Max(); } 114 public abstract TDistance GetRange(); 115 public abstract TValue GetMean(); 116 public virtual TValue GetMedian() { return Values.Quantile(0.5); } 117 public virtual TValue GetMode() { return Values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); } 118 public abstract TDistance GetStandardDeviation(); 119 public abstract TDistance GetVariance(); 120 public virtual TValue GetQuantile(double alpha) { return Values.Quantile(alpha); } 121 public virtual int GetDistinctValues() { return Values.GroupBy(x => x).Count(); } 122 public virtual int GetNumberOfMissingValues() { return Values.Count(IsValidValue); } 123 #endregion 124 } 125 126 [Item("NullablePreprocessingDataColumn", "")] 127 [StorableClass] 128 public abstract class NullablePreprocessingDataColumn<TValue, TDistance> : PreprocessingDataColumn 129 where TValue : struct, IComparable<TValue> { 130 131 #region Constructor, Cloning & Persistence 132 protected NullablePreprocessingDataColumn() 133 : this(string.Empty, Enumerable.Empty<TValue?>()) { } 134 protected NullablePreprocessingDataColumn(string name, IEnumerable<TValue> values) 135 : this(name, 
values.Select(x => (TValue?)x)) { } 136 protected NullablePreprocessingDataColumn(string name, IEnumerable<TValue?> values) 137 : base(name) { 138 Values = new List<TValue?>(values); 139 } 140 141 protected NullablePreprocessingDataColumn(NullablePreprocessingDataColumn<TValue, TDistance> original, Cloner cloner) 142 : base(original, cloner) { 143 Values = new List<TValue?>(original.Values); 144 } 145 146 [StorableConstructor] 147 protected NullablePreprocessingDataColumn(bool deserializing) 148 : base(deserializing) { } 149 #endregion 150 151 [Storable] 152 internal List<TValue?> Values { get; private set; } 153 protected IEnumerable<TValue> ValidValues { 154 get { return Values.Where(x => x.HasValue && IsValidValue(x.Value)).Select(x => x.Value); } 155 } 156 157 public override Type GetValueType() { 158 return typeof(TValue); 159 } 160 161 public override int Length { 162 get { return Values.Count; } 163 } 164 165 public TValue? this[int index] { 166 get { return Values[index]; } 167 set { Values[index] = value; } 168 } 169 170 public virtual bool IsValidValue(TValue value) { return true; } 171 public override bool IsValidValue(int index) { 172 var value = Values[index]; 173 return value.HasValue && IsValidValue(value.Value); 174 } 175 176 #region Statistics 177 public virtual TValue GetMin() { return ValidValues.Min(); } 178 public virtual TValue GetMax() { return ValidValues.Max(); } 179 public abstract TDistance GetRange(); 180 public abstract TValue GetMean(); 181 public virtual TValue GetMedian() { return ValidValues.Quantile(0.5); } 182 public virtual TValue GetMode() { return ValidValues.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); } 183 public abstract TDistance GetStandardDeviation(); 184 public abstract TDistance GetVariance(); 185 public virtual TValue GetQuantile(double alpha) { return ValidValues.Quantile(alpha); } 186 public virtual int GetDistinctValues() { return ValidValues.GroupBy(x => x).Count(); } 187 public 
virtual int GetNumberOfMissingValues() { return Values.Count - ValidValues.Count(); } 192 193 public virtual T GetMin(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Min(); } 194 public virtual T GetMax(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Max(); } 195 public virtual T GetMedian(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Quantile(0.5); } 196 public virtual T GetMode(IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First(); } 197 public virtual T GetQuantile(double alpha, IEnumerable<int> indices = null) { return GetValidValues(indices).DefaultIfEmpty(DefaultValue).Quantile(alpha); } 198 public override int GetDistinctValues(IEnumerable<int> indices = null) { return GetValidValues(indices).GroupBy(x => x).Count(); } 199 public override int GetNumberOfMissingValues(IEnumerable<int> indices = null) { return GetValues(indices).Count(x => !IsValidValue(x)); } 188 200 #endregion 189 201 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/Columns/StringPreprocessingDataColumn.cs
r15291 r15309 28 28 [Item("StringPreprocessingDataColumn", "")] 29 29 [StorableClass] 30 public sealed class StringPreprocessingDataColumn : PreprocessingDataColumn<string, string> { 30 public sealed class StringPreprocessingDataColumn : PreprocessingDataColumn<string> { 31 31 32 32 #region Constructor, Cloning & Persistence … … 53 53 } 54 54 55 #region Statistics 56 public override string GetRange() { return string.Empty; } 57 public override string GetMean() { return string.Empty; } 58 public override string GetStandardDeviation() { return string.Empty; } 59 public override string GetVariance() { return string.Empty; } 60 #endregion 55 protected override string DefaultValue { get { return string.Empty; } } 61 56 62 57 #region IStringConvertibleColumn 63 58 public override bool Validate(string value, out string errorMessage) { 64 if (value == null) { 65 errorMessage = "Invalid Value (string must not be null)"; 66 return false; 67 } else { 68 errorMessage = string.Empty; 69 return true; 70 } 59 errorMessage = string.Empty; 60 return true; 71 61 } 72 62 public override string GetValue(int index) { -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/FilteredPreprocessingData.cs
r15283 r15309 38 38 private IPreprocessingData filteredData; 39 39 40 public IList<PreprocessingDataColumn> DataColumns { 41 get { return ActiveData.DataColumns; } 42 } 43 40 44 public IPreprocessingData ActiveData { 41 45 get { return IsFiltered ? filteredData : originalData; } … … 82 86 } 83 87 84 public I List<T> GetValues<T>(int columnIndex, bool considerSelection) {88 public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) { 85 89 return ActiveData.GetValues<T>(columnIndex, considerSelection); 86 90 } 87 91 88 public void SetValues<T>(int columnIndex, I List<T> values) {92 public void SetValues<T>(int columnIndex, IEnumerable<T> values) { 89 93 if (IsFiltered) 90 94 throw new InvalidOperationException("SetValues not possible while data is filtered"); … … 123 127 } 124 128 125 public void DeleteRows WithIndices(IEnumerable<int> rows) {129 public void DeleteRows(IEnumerable<int> rows) { 126 130 if (IsFiltered) 127 131 throw new InvalidOperationException("DeleteRowsWithIndices not possible while data is filtered"); 128 132 129 originalData.DeleteRows WithIndices(rows);133 originalData.DeleteRows(rows); 130 134 } 131 135 … … 273 277 public void EndTransaction() { 274 278 originalData.EndTransaction(); 275 }276 #endregion277 278 #region Statistics279 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {280 return ActiveData.GetMin<T>(columnIndex, considerSelection, emptyValue);281 }282 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {283 return ActiveData.GetMax<T>(columnIndex, considerSelection, emptyValue);284 }285 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) {286 return ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue);287 }288 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> {289 return 
ActiveData.GetMean<T>(columnIndex, considerSelection, emptyValue); 290 } 291 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 292 return ActiveData.GetMode<T>(columnIndex, considerSelection, emptyValue); 293 } 294 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 295 return ActiveData.GetStandardDeviation<T>(columnIndex, considerSelection, emptyValue); 296 } 297 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 298 return ActiveData.GetVariance<T>(columnIndex, considerSelection, emptyValue); 299 } 300 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 301 return ActiveData.GetQuantile<T>(alpha, columnIndex, considerSelection, emptyValue); 302 } 303 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 304 return ActiveData.GetDistinctValues<T>(columnIndex, considerSelection); 305 } 306 307 public int GetMissingValueCount() { 308 return ActiveData.GetMissingValueCount(); 309 } 310 public int GetMissingValueCount(int columnIndex) { 311 return ActiveData.GetMissingValueCount(columnIndex); 312 } 313 public int GetRowMissingValueCount(int rowIndex) { 314 return ActiveData.GetRowMissingValueCount(rowIndex); 315 279 } 316 280 #endregion -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/IPreprocessingData.cs
r15283 r15309 28 28 namespace HeuristicLab.DataPreprocessing { 29 29 public interface IPreprocessingData : INamedItem { 30 31 IList<PreprocessingDataColumn> DataColumns { get; } 32 30 33 #region Cells 31 34 bool IsCellEmpty(int columnIndex, int rowIndex); … … 36 39 string GetCellAsString(int columnIndex, int rowIndex); 37 40 38 I List<T> GetValues<T>(int columnIndex, bool considerSelection = false);41 IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection = false); 39 42 40 void SetValues<T>(int columnIndex, I List<T> values);43 void SetValues<T>(int columnIndex, IEnumerable<T> values); 41 44 bool SetValue(string value, int columnIndex, int rowIndex); 42 45 … … 48 51 void InsertRow(int rowIndex); 49 52 void DeleteRow(int rowIndex); 50 void DeleteRows WithIndices(IEnumerable<int> rows);53 void DeleteRows(IEnumerable<int> rows); 51 54 void InsertColumn<T>(string variableName, int columnIndex); 52 55 … … 106 109 void EndTransaction(); 107 110 #endregion 108 109 #region Statistics110 T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));111 T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));112 T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));113 T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;114 T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T>;115 T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));116 T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T));117 T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T>;118 int GetDistinctValues<T>(int columnIndex, bool considerSelection = false);119 120 int GetMissingValueCount();121 int 
GetMissingValueCount(int columnIndex); 122 int GetRowMissingValueCount(int rowIndex); 123 #endregion 124 111 } 125 112 } -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Data/PreprocessingData.cs
r15291 r15309 36 36 public class PreprocessingData : NamedItem, IPreprocessingData { 37 37 38 [Storable] 39 protected List<PreprocessingDataColumn> dataColumns; 38 [Storable] private List<PreprocessingDataColumn> dataColumns; 39 40 public IList<PreprocessingDataColumn> DataColumns { 41 get { return dataColumns; } 42 } 43 40 44 41 45 #region Constructor, Cloning & Persistence … … 99 103 } 100 104 101 private void ColumnTypeSwitchAction<T>(int columnIndex, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,102 Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {103 ColumnTypeSwitchAction(dataColumns[columnIndex], value, doubleAction, stringAction, dateTimeAction);104 }105 private void ColumnTypeSwitchAction<T>(PreprocessingDataColumn column, T value, Action<DoublePreprocessingDataColumn, double?> doubleAction,106 Action<StringPreprocessingDataColumn, string> stringAction = null, Action<DateTimePreprocessingDataColumn, DateTime?> dateTimeAction = null) {107 var doubleColumn = column as DoublePreprocessingDataColumn;108 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn, Convert<double?>(value));109 var stringColumn = column as StringPreprocessingDataColumn;110 if (stringColumn != null && stringAction != null) stringAction(stringColumn, Convert<string>(value));111 var dateTimeColumn = column as DateTimePreprocessingDataColumn;112 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn, Convert<DateTime?>(value));113 }114 115 private void ColumnTypeSwitchAction(int columnIndex, Action<DoublePreprocessingDataColumn> doubleAction,116 Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {117 ColumnTypeSwitchAction(dataColumns[columnIndex], doubleAction, stringAction, dateTimeAction);118 }119 private void 
ColumnTypeSwitchAction(PreprocessingDataColumn column, Action<DoublePreprocessingDataColumn> doubleAction,120 Action<StringPreprocessingDataColumn> stringAction = null, Action<DateTimePreprocessingDataColumn> dateTimeAction = null) {121 var doubleColumn = column as DoublePreprocessingDataColumn;122 if (doubleColumn != null && doubleAction != null) doubleAction(doubleColumn);123 var stringColumn = column as StringPreprocessingDataColumn;124 if (stringColumn != null && stringAction != null) stringAction(stringColumn);125 var dateTimeColumn = column as DateTimePreprocessingDataColumn;126 if (dateTimeColumn != null && dateTimeAction != null) dateTimeAction(dateTimeColumn);127 }128 129 130 private T ColumnTypeSwitchFunc<T>(int columnIndex, Func<DoublePreprocessingDataColumn, double?> doubleFunc,131 Func<StringPreprocessingDataColumn, string> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?> dateTimeFunc = null) {132 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;133 if (doubleColumn != null && doubleFunc != null) return Convert<T>(doubleFunc(doubleColumn));134 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;135 if (stringColumn != null && stringFunc != null) return Convert<T>(stringFunc(stringColumn));136 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;137 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<T>(dateTimeFunc(dateTimeColumn));138 throw new InvalidOperationException("Invalid data column type.");139 }140 141 private T ColumnTypeSwitchFuncResult<T>(int columnIndex, Func<DoublePreprocessingDataColumn, T> doubleFunc,142 Func<StringPreprocessingDataColumn, T> stringFunc = null, Func<DateTimePreprocessingDataColumn, T> dateTimeFunc = null) {143 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;144 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn);145 var stringColumn = 
dataColumns[columnIndex] as StringPreprocessingDataColumn;146 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn);147 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;148 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn);149 throw new InvalidOperationException("Invalid data column type.");150 }151 private TOut ColumnTypeSwitchFuncResult<TIn, TOut>(int columnIndex, TIn value, Func<DoublePreprocessingDataColumn, double?, TOut> doubleFunc,152 Func<StringPreprocessingDataColumn, string, TOut> stringFunc = null, Func<DateTimePreprocessingDataColumn, DateTime?, TOut> dateTimeFunc = null) {153 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;154 if (doubleColumn != null && doubleFunc != null) return doubleFunc(doubleColumn, Convert<double?>(value));155 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;156 if (stringColumn != null && stringFunc != null) return stringFunc(stringColumn, Convert<string>(value));157 var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;158 if (dateTimeColumn != null && dateTimeFunc != null) return dateTimeFunc(dateTimeColumn, Convert<DateTime?>(value));159 throw new InvalidOperationException("Invalid data column type.");160 }161 162 private IList<T> ColumnTypeSwitchFuncList<T>(int columnIndex, Func<DoublePreprocessingDataColumn, IList<double>> doubleFunc,163 Func<StringPreprocessingDataColumn, IList<string>> stringFunc = null, Func<DateTimePreprocessingDataColumn, IList<DateTime>> dateTimeFunc = null) {164 var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn;165 if (doubleColumn != null && doubleFunc != null) return Convert<IList<T>>(doubleFunc(doubleColumn));166 var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn;167 if (stringColumn != null && stringFunc != null) return Convert<IList<T>>(stringFunc(stringColumn));168 
var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn;169 if (dateTimeColumn != null && dateTimeFunc != null) return Convert<IList<T>>(dateTimeFunc(dateTimeColumn));170 throw new InvalidOperationException("Invalid data column type.");171 }172 private static T Convert<T>(object obj) { return (T)obj; }173 174 175 105 public T GetCell<T>(int columnIndex, int rowIndex) { 176 return ColumnTypeSwitchFunc<T>(columnIndex,106 return dataColumns[columnIndex].TypeSwitch<T>( 177 107 c => c[rowIndex], 178 108 c => c[rowIndex], … … 188 118 InsertColumn<T>(i.ToString(), i); 189 119 190 ColumnTypeSwitchAction<T>(columnIndex,value,120 dataColumns[columnIndex].TypeSwitch<T>(value, 191 121 (c, v) => c[rowIndex] = v, 192 122 (c, v) => c[rowIndex] = v, … … 201 131 } 202 132 203 public IList<T> GetValues<T>(int columnIndex, bool considerSelection) { 204 if (considerSelection) { 205 var list = new List<T>(); 206 foreach (var rowIdx in selection[columnIndex]) { 207 list.Add(GetCell<T>(columnIndex, rowIdx)); 208 //list.Add((T)dataColumns[columnIndex][rowIdx]); 209 } 210 return list; 211 } else { 212 return ColumnTypeSwitchFuncList<T>(columnIndex, 213 c => c.Values.Select(x => x ?? double.NaN).ToList(), 214 c => c.Values, 215 c => c.Values.Select(x => x ?? DateTime.MinValue).ToList()); 216 //(IList<T>)dataColumns[columnIndex]; 217 } 218 } 219 220 public void SetValues<T>(int columnIndex, IList<T> values) { 133 public IEnumerable<T> GetValues<T>(int columnIndex, bool considerSelection) { 134 return dataColumns[columnIndex].TypeSwitch<T>( 135 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 136 c => c.GetValues(considerSelection ? selection[columnIndex] : null), 137 c => c.GetValues(considerSelection ? 
selection[columnIndex] : null)); 138 } 139 140 public void SetValues<T>(int columnIndex, IEnumerable<T> values) { 221 141 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 222 142 if (VariableHasType<T>(columnIndex)) { … … 239 159 240 160 public bool SetValue(string value, int columnIndex, int rowIndex) { 241 bool valid = false; 242 if (VariableHasType<double>(columnIndex)) { 243 double val; 244 if (string.IsNullOrWhiteSpace(value)) { 245 val = double.NaN; 246 valid = true; 247 } else { 248 valid = double.TryParse(value, out val); 249 } 250 if (valid) 251 SetCell(columnIndex, rowIndex, val); 252 } else if (VariableHasType<string>(columnIndex)) { 253 valid = value != null; 254 if (valid) 255 SetCell(columnIndex, rowIndex, value); 256 } else if (VariableHasType<DateTime>(columnIndex)) { 257 DateTime date; 258 valid = DateTime.TryParse(value, out date); 259 if (valid) 260 SetCell(columnIndex, rowIndex, date); 261 } else { 262 throw new ArgumentException("column " + columnIndex + " contains a non supported type."); 263 } 161 var column = dataColumns[columnIndex]; 162 bool successful = column.SetValue(value, rowIndex); 264 163 265 164 if (!IsInTransaction) 266 165 OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 267 166 268 return valid;167 return successful; 269 168 } 270 169 … … 274 173 275 174 public int Rows { 276 get { return dataColumns. Count > 0 ? dataColumns[0].Length: 0; }175 get { return dataColumns.Any() ? 
dataColumns.Max(c => c.Length) : 0; } 277 176 } 278 177 #endregion … … 281 180 public void InsertRow(int rowIndex) { 282 181 SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 182 283 183 foreach (var column in dataColumns) { 284 ColumnTypeSwitchAction(column, 184 column.TypeSwitch( 185 c => c.Values.Insert(rowIndex, double.NaN), 285 186 c => c.Values.Insert(rowIndex, null), 286 c => c.Values.Insert(rowIndex, null), 287 c => c.Values.Insert(rowIndex, null)); 288 //var valueType = column.GetValueType(); 289 //column.Insert(rowIndex, valueType.IsValueType ? Activator.CreateInstance(valueType) : null); 290 } 187 c => c.Values.Insert(rowIndex, DateTime.MinValue)); 188 } 189 291 190 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 292 191 TrainingPartition.End++; … … 302 201 } 303 202 } 203 304 204 if (!IsInTransaction) 305 205 OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 306 206 } 207 307 208 public void DeleteRow(int rowIndex) { 308 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); 309 foreach (var column in dataColumns) { 310 ColumnTypeSwitchAction(column, 311 c => c.Values.RemoveAt(rowIndex), 312 c => c.Values.RemoveAt(rowIndex), 313 c => c.Values.RemoveAt(rowIndex)); 314 //column.RemoveAt(rowIndex); 315 } 316 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 317 TrainingPartition.End--; 318 if (TrainingPartition.End <= TestPartition.Start) { 319 TestPartition.Start--; 320 TestPartition.End--; 321 } 322 } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { 323 TestPartition.End--; 324 if (TestPartition.End <= TrainingPartition.Start) { 325 TestPartition.Start--; 326 TestPartition.End--; 327 } 328 } 329 if (!IsInTransaction) 330 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); 331 } 332 public void DeleteRowsWithIndices(IEnumerable<int> rows) { 209 DeleteRows(new[] { rowIndex }); 210 } 211 public 
void DeleteRows(IEnumerable<int> rowIndices) { 333 212 SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); 334 foreach (int rowIndex in rows.OrderByDescending(x => x)) { 213 214 foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) { 335 215 foreach (var column in dataColumns) { 336 ColumnTypeSwitchAction(column,216 column.TypeSwitch( 337 217 c => c.Values.RemoveAt(rowIndex), 338 218 c => c.Values.RemoveAt(rowIndex), 339 219 c => c.Values.RemoveAt(rowIndex)); 340 //column.RemoveAt(rowIndex);341 } 220 } 221 342 222 if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { 343 223 TrainingPartition.End--; … … 354 234 } 355 235 } 236 356 237 if (!IsInTransaction) 357 238 OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); … … 362 243 363 244 if (typeof(T) == typeof(double)) { 364 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double ?>(null, Rows)));245 dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat<double>(double.NaN, Rows))); 365 246 } else if (typeof(T) == typeof(string)) { 366 dataColumns. Add(new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(null, Rows)));247 dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat<string>(string.Empty, Rows))); 367 248 } else if (typeof(T) == typeof(DateTime)) { 368 dataColumns. 
Add(new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime?>(null, Rows)));249 dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat<DateTime>(DateTime.MinValue, Rows))); 369 250 } else { 370 251 throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); 371 252 } 372 253 373 //dataColumns.Insert(columnIndex, new List<T>(Enumerable.Repeat(default(T), Rows)));374 //variableNames.Insert(columnIndex, variableName);375 254 if (!IsInTransaction) 376 255 OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); … … 379 258 public void DeleteColumn(int columnIndex) { 380 259 SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); 260 381 261 dataColumns.RemoveAt(columnIndex); 382 //variableNames.RemoveAt(columnIndex); 262 383 263 if (!IsInTransaction) 384 264 OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); … … 386 266 387 267 public void RenameColumn(int columnIndex, string name) { 388 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1);389 268 if (columnIndex < 0 || columnIndex > dataColumns.Count) 390 269 throw new ArgumentOutOfRangeException("columnIndex"); 270 271 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); 272 391 273 dataColumns[columnIndex].Name = name; 392 274 … … 400 282 401 283 SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); 284 402 285 for (int i = 0; i < names.Count; i++) 403 286 dataColumns[i].Name = names[i]; … … 408 291 409 292 public bool AreAllStringColumns(IEnumerable<int> columnIndices) { 410 return columnIndices.All( x => VariableHasType<string>(x));293 return columnIndices.All(VariableHasType<string>); 411 294 } 412 295 #endregion … … 522 405 var stringColumn = dataColumns[i] as StringPreprocessingDataColumn; 523 406 var dateTimeColumn = dataColumns[i] as 
DateTimePreprocessingDataColumn; 524 if (doubleColumn != null) values.Add(new List<double>(doubleColumn. Values.Select(x => x ?? double.NaN)));525 else if (stringColumn != null) values.Add(new List<string>(stringColumn. Values));526 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn. Values.Select(x => x ?? DateTime.MinValue)));407 if (doubleColumn != null) values.Add(new List<double>(doubleColumn.GetValues())); 408 else if (stringColumn != null) values.Add(new List<string>(stringColumn.GetValues())); 409 else if (dateTimeColumn != null) values.Add(new List<DateTime>(dateTimeColumn.GetValues())); 527 410 else throw new InvalidOperationException("Column type not supported for export"); 528 411 } … … 638 521 #endregion 639 522 640 #region Statistics 641 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 642 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 643 return values.Any() ? values.Min() : emptyValue; 644 } 645 646 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 647 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 648 return values.Any() ? values.Max() : emptyValue; 649 } 650 651 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 652 if (typeof(T) == typeof(double)) { 653 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 654 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 655 } 656 if (typeof(T) == typeof(string)) { 657 return Convert<T>(string.Empty); 658 } 659 if (typeof(T) == typeof(DateTime)) { 660 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 661 return values.Any() ? 
Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 662 } 663 664 throw new InvalidOperationException(typeof(T) + " not supported"); 665 } 666 667 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 668 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 669 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 670 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 671 } 672 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 673 return values.Any() ? values.Quantile(0.5) : emptyValue; 674 } 675 676 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 677 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 678 return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 679 } 680 681 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 682 if (typeof(T) == typeof(double)) { 683 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 684 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 685 } 686 // For DateTime, std.dev / variance would have to be TimeSpan 687 //if (typeof(T) == typeof(DateTime)) { 688 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 689 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 690 //} 691 return default(T); 692 } 693 694 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 695 if (typeof(T) == typeof(double)) { 696 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 697 return values.Any() ? 
Convert<T>(values.Variance()) : emptyValue; 698 } 699 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 700 //if (typeof(T) == typeof(DateTime)) { 701 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 702 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 703 //} 704 return default(T); 705 } 706 707 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 708 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 709 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 710 return doubleValues.Any() ? Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 711 } 712 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 713 return values.Any() ? values.Quantile(alpha) : emptyValue; 714 } 715 716 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 717 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 718 return values.GroupBy(x => x).Count(); 719 } 720 721 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 722 //var doubleColumn = dataColumns[columnIndex] as DoublePreprocessingDataColumn; 723 //var stringColumn = dataColumns[columnIndex] as StringPreprocessingDataColumn; 724 //var dateTimeColumn = dataColumns[columnIndex] as DateTimePreprocessingDataColumn; 725 //return GetValues<T>(columnIndex, considerSelection).Where(x => 726 // doubleColumn != null ? doubleColumn.IsValidValue(Convert<double>(x)) 727 // : stringColumn != null ? stringColumn.IsValidValue(Convert<string>(x)) 728 // : dateTimeColumn != null ? 
dateTimeColumn.IsValidValue(Convert<DateTime>(x)) 729 // : false); 730 //!IsMissingValue(x)); 731 732 return GetValues<T>(columnIndex, considerSelection).Where(x => 733 ColumnTypeSwitchFuncResult<T, bool>(columnIndex, x, 734 (c, v) => v.HasValue && c.IsValidValue(v.Value), 735 (c, v) => c.IsValidValue(v), 736 (c, v) => v.HasValue && c.IsValidValue(v.Value) 737 )); 738 } 739 740 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 741 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 742 } 743 744 public int GetMissingValueCount() { 745 int count = 0; 746 for (int i = 0; i < Columns; ++i) { 747 count += GetMissingValueCount(i); 748 } 749 return count; 750 } 751 public int GetMissingValueCount(int columnIndex) { 752 int sum = 0; 753 for (int i = 0; i < Rows; i++) { 754 if (IsCellEmpty(columnIndex, i)) 755 sum++; 756 } 757 return sum; 758 } 759 public int GetRowMissingValueCount(int rowIndex) { 760 int sum = 0; 761 for (int i = 0; i < Columns; i++) { 762 if (IsCellEmpty(i, rowIndex)) 763 sum++; 764 } 765 return sum; 766 } 767 #endregion 768 769 #region Helpers 770 private static IList<IList> CopyVariableValues(IList<IList> original) { 771 var copy = new List<IList>(original); 772 for (int i = 0; i < original.Count; ++i) { 773 copy[i] = (IList)Activator.CreateInstance(original[i].GetType(), original[i]); 774 } 775 return copy; 776 } 777 #endregion 523 /* #region Statistics 524 public T GetMin<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 525 try { 526 return dataColumns[columnIndex].TypeSwitch<T>( 527 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 528 col => col.GetMin(considerSelection ? Selection[columnIndex] : null), 529 col => col.GetMin(considerSelection ? 
Selection[columnIndex] : null)); 530 } catch (InvalidOperationException) { 531 return emptyValue; 532 } 533 } 534 535 public T GetMax<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 536 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 537 return values.Any() ? values.Max() : emptyValue; 538 } 539 540 public T GetMean<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 541 return 542 543 544 if (typeof(T) == typeof(double)) { 545 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 546 return values.Any() ? Convert<T>(values.Average()) : emptyValue; 547 } 548 if (typeof(T) == typeof(string)) { 549 return Convert<T>(string.Empty); 550 } 551 if (typeof(T) == typeof(DateTime)) { 552 var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 553 return values.Any() ? Convert<T>(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; 554 } 555 556 throw new InvalidOperationException(typeof(T) + " not supported"); 557 } 558 559 public T GetMedian<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 560 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 561 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 562 return doubleValues.Any() ? Convert<T>(doubleValues.Median()) : emptyValue; 563 } 564 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 565 return values.Any() ? values.Quantile(0.5) : emptyValue; 566 } 567 568 public T GetMode<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable<T> { 569 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 570 return values.Any() ? 
values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; 571 } 572 573 public T GetStandardDeviation<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 574 if (typeof(T) == typeof(double)) { 575 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 576 return values.Any() ? Convert<T>(values.StandardDeviation()) : emptyValue; 577 } 578 // For DateTime, std.dev / variance would have to be TimeSpan 579 //if (typeof(T) == typeof(DateTime)) { 580 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 581 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; 582 //} 583 return default(T); 584 } 585 586 public T GetVariance<T>(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { 587 if (typeof(T) == typeof(double)) { 588 var values = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 589 return values.Any() ? Convert<T>(values.Variance()) : emptyValue; 590 } 591 // DateTime variance often overflows long, thus the corresponding DateTime is invalid 592 //if (typeof(T) == typeof(DateTime)) { 593 // var values = GetValuesWithoutMissingValues<DateTime>(columnIndex, considerSelection); 594 // return values.Any() ? Convert<T>(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; 595 //} 596 return default(T); 597 } 598 599 public T GetQuantile<T>(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable<T> { 600 if (typeof(T) == typeof(double)) {// IEnumerable<double> is faster 601 var doubleValues = GetValuesWithoutMissingValues<double>(columnIndex, considerSelection); 602 return doubleValues.Any() ? 
Convert<T>(doubleValues.Quantile(alpha)) : emptyValue; 603 } 604 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 605 return values.Any() ? values.Quantile(alpha) : emptyValue; 606 } 607 608 public int GetDistinctValues<T>(int columnIndex, bool considerSelection = false) { 609 var values = GetValuesWithoutMissingValues<T>(columnIndex, considerSelection); 610 return values.GroupBy(x => x).Count(); 611 } 612 613 private IEnumerable<T> GetValuesWithoutMissingValues<T>(int columnIndex, bool considerSelection) { 614 return GetValues<T>(columnIndex, considerSelection).Where(x => 615 ColumnTypeSwitch<T, bool>(dataColumns[columnIndex], x, 616 (c, v) => c.IsValidValue(v), 617 (c, v) => c.IsValidValue(v), 618 (c, v) => c.IsValidValue(v) 619 )); 620 } 621 622 private static DateTime AggregateAsDouble(IEnumerable<DateTime> values, Func<IEnumerable<double>, double> func) { 623 return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); 624 } 625 626 public int GetMissingValueCount() { 627 int count = 0; 628 for (int i = 0; i < Columns; ++i) { 629 count += GetMissingValueCount(i); 630 } 631 return count; 632 } 633 public int GetMissingValueCount(int columnIndex) { 634 int sum = 0; 635 for (int i = 0; i < Rows; i++) { 636 if (IsCellEmpty(columnIndex, i)) 637 sum++; 638 } 639 return sum; 640 } 641 public int GetRowMissingValueCount(int rowIndex) { 642 int sum = 0; 643 for (int i = 0; i < Columns; i++) { 644 if (IsCellEmpty(i, rowIndex)) 645 sum++; 646 } 647 return sum; 648 } 649 #endregion */ 778 650 } 779 651 -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/PreprocessingTransformator.cs
r15270 r15309 83 83 int colIndex = preprocessingData.GetColumnIndex(transformation.Column); 84 84 var originalData = preprocessingData.GetValues<double>(colIndex); 85 originalColumns.Add(transformation.Column, originalData );85 originalColumns.Add(transformation.Column, originalData.ToList()); 86 86 } 87 87 } … … 107 107 } 108 108 109 private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, I List<double> data, out bool success, out string errorMsg) {109 private IEnumerable<double> ApplyDoubleTransformation(Transformation<double> transformation, IEnumerable<double> data, out bool success, out string errorMsg) { 110 110 success = transformation.Check(data, out errorMsg); 111 111 // don't apply when the check fails -
branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/ProblemDataCreator.cs
r15110 r15309 129 129 } 130 130 131 private bool IsNotConstantInputVariable(I List<double> list) {131 private bool IsNotConstantInputVariable(IEnumerable<double> list) { 132 132 return context.Data.TrainingPartition.End - context.Data.TrainingPartition.Start > 1 || list.Range() > 0; 133 133 }
Note: See TracChangeset
for help on using the changeset viewer.