source: branches/DataPreprocessing Enhancements/HeuristicLab.DataPreprocessing/3.4/Logic/ManipulationLogic.cs @ 14996

Last change on this file since 14996 was 14996, checked in by pfleck, 5 years ago

#2709

  • Fixed initial selection of the grouping text box (empty string instead of null to select the first entry).
  • General code fixes (removed unnecessary blank lines and code, class member order, ...)
File size: 15.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Data;
26
27namespace HeuristicLab.DataPreprocessing {
28  public class ManipulationLogic {
29    private readonly ITransactionalPreprocessingData preprocessingData;
30    private readonly StatisticsLogic statisticsLogic;
31    private readonly SearchLogic searchLogic;
32
33    public IEnumerable<string> VariableNames {
34      get { return preprocessingData.VariableNames; }
35    }
36
37    public ITransactionalPreprocessingData PreProcessingData {
38      get { return preprocessingData; }
39    }
40
41    public ManipulationLogic(ITransactionalPreprocessingData preprocessingData, SearchLogic theSearchLogic, StatisticsLogic theStatisticsLogic) {
42      this.preprocessingData = preprocessingData;
43      searchLogic = theSearchLogic;
44      statisticsLogic = theStatisticsLogic;
45    }
46
47    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
48      foreach (int index in rowIndices) {
49        preprocessingData.SetCell<T>(columnIndex, index, value);
50      }
51    }
52
53    public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
54      preprocessingData.InTransaction(() => {
55        foreach (var column in cells) {
56          if (preprocessingData.VariableHasType<double>(column.Key)) {
57            double average = statisticsLogic.GetAverage(column.Key, considerSelection);
58            ReplaceIndicesByValue<double>(column.Key, column.Value, average);
59          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
60            DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
61            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
62          }
63        }
64      });
65    }
66
67    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
68      preprocessingData.InTransaction(() => {
69        foreach (var column in cells) {
70          if (preprocessingData.VariableHasType<double>(column.Key)) {
71            double median = statisticsLogic.GetMedian(column.Key, considerSelection);
72            ReplaceIndicesByValue<double>(column.Key, column.Value, median);
73          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
74            DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
75            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
76          }
77        }
78      });
79    }
80
81    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
82      preprocessingData.InTransaction(() => {
83        Random r = new Random();
84
85        foreach (var column in cells) {
86          if (preprocessingData.VariableHasType<double>(column.Key)) {
87            double max = statisticsLogic.GetMax<double>(column.Key, double.NaN, considerSelection);
88            double min = statisticsLogic.GetMin<double>(column.Key, double.NaN, considerSelection);
89            double randMultiplier = (max - min);
90            foreach (int index in column.Value) {
91              double rand = r.NextDouble() * randMultiplier + min;
92              preprocessingData.SetCell<double>(column.Key, index, rand);
93            }
94          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
95            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, DateTime.MinValue, considerSelection);
96            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, DateTime.MinValue, considerSelection);
97            double randMultiplier = (max - min).TotalSeconds;
98            foreach (int index in column.Value) {
99              double rand = r.NextDouble() * randMultiplier;
100              preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
101            }
102          }
103        }
104      });
105    }
106
107    public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
108      preprocessingData.InTransaction(() => {
109        foreach (var column in cells) {
110          IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column);
111          foreach (var tuple in startEndings) {
112            Interpolate(column, tuple.Item1, tuple.Item2);
113          }
114        }
115      });
116    }
117
118    private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
119      List<Tuple<int, int>> startEndings = new List<Tuple<int, int>>();
120      var rowIndices = column.Value;
121      rowIndices = rowIndices.OrderBy(x => x).ToList();
122      var count = rowIndices.Count;
123      int start = int.MinValue;
124      for (int i = 0; i < count; ++i) {
125        if (start == int.MinValue) {
126          start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
127        }
128        if (i + 1 == count || (i + 1 < count && rowIndices[i + 1] - rowIndices[i] > 1)) {
129          int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
130          if (start > 0 && next < preprocessingData.Rows) {
131            startEndings.Add(new Tuple<int, int>(start, next));
132          }
133          start = int.MinValue;
134        }
135      }
136      return startEndings;
137    }
138
139    public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
140      preprocessingData.InTransaction(() => {
141        foreach (var column in cells) {
142          int countValues = preprocessingData.Rows;
143
144          foreach (int index in column.Value) {
145            // dont replace first or last values
146            if (index > 0 && index < countValues) {
147              int prevIndex = indexOfPrevPresentValue(column.Key, index);
148              int nextIndex = indexOfNextPresentValue(column.Key, index);
149
150              // no neighbours found
151              if (prevIndex < 0 || nextIndex >= countValues) {
152                continue;
153              }
154
155              Interpolate(column, prevIndex, nextIndex);
156            }
157          }
158        }
159      });
160    }
161
162    private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
163      int valuesToInterpolate = nextIndex - prevIndex;
164
165      if (preprocessingData.VariableHasType<double>(column.Key)) {
166        double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
167        double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
168        double interpolationStep = (next - prev) / valuesToInterpolate;
169
170        for (int i = prevIndex; i < nextIndex; ++i) {
171          double interpolated = prev + (interpolationStep * (i - prevIndex));
172          preprocessingData.SetCell<double>(column.Key, i, interpolated);
173        }
174      } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
175        DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
176        DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
177        double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
178
179        for (int i = prevIndex; i < nextIndex; ++i) {
180          DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
181          preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
182        }
183      }
184    }
185
186    private int indexOfPrevPresentValue(int columnIndex, int start) {
187      int offset = start - 1;
188      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
189        offset--;
190      }
191
192      return offset;
193    }
194
195    private int indexOfNextPresentValue(int columnIndex, int start) {
196      int offset = start + 1;
197      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
198        offset++;
199      }
200
201      return offset;
202    }
203
204    public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
205      preprocessingData.InTransaction(() => {
206        foreach (var column in cells) {
207          if (preprocessingData.VariableHasType<double>(column.Key)) {
208            ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, double.NaN, considerSelection));
209          } else if (preprocessingData.VariableHasType<string>(column.Key)) {
210            ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, string.Empty, considerSelection));
211          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
212            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, DateTime.MinValue, considerSelection));
213          } else {
214            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
215          }
216        }
217      });
218    }
219
220    public void Shuffle(bool shuffleRangesSeparately) {
221      Random random = new Random();
222      var ranges = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
223      if (shuffleRangesSeparately) {
224        preprocessingData.InTransaction(() => {
225          // process all given ranges - e.g. TrainingPartition, TestPartition
226          foreach (IntRange range in ranges) {
227            List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();
228
229            // generate random indices used for shuffeling each column
230            for (int i = range.End - 1; i >= range.Start; --i) {
231              int rand = random.Next(range.Start, i);
232              shuffledIndices.Add(new Tuple<int, int>(i, rand));
233            }
234
235            ShuffleToIndices(shuffledIndices);
236          }
237        });
238      } else {
239        preprocessingData.InTransaction(() => {
240          var indices = ranges.SelectMany(x => Enumerable.Range(x.Start, x.Size)).ToList();
241          var shuffledIndices = indices.OrderBy(x => random.Next());
242          ShuffleToIndices(indices.Zip(shuffledIndices, (i, j) => new Tuple<int, int>(i, j)).ToList());
243        });
244      }
245    }
246
247    public void ReOrderToIndices(IEnumerable<int> indices) {
248      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();
249
250      for (int i = 0; i < indices.Count(); ++i) {
251        indicesTuple.Add(new Tuple<int, int>(i, indices.ElementAt(i)));
252      }
253
254      ReOrderToIndices(indicesTuple);
255    }
256
257    public void ReOrderToIndices(IList<System.Tuple<int, int>> indices) {
258      preprocessingData.InTransaction(() => {
259        for (int i = 0; i < preprocessingData.Columns; ++i) {
260          if (preprocessingData.VariableHasType<double>(i)) {
261            ReOrderToIndices<double>(i, indices);
262          } else if (preprocessingData.VariableHasType<string>(i)) {
263            ReOrderToIndices<string>(i, indices);
264          } else if (preprocessingData.VariableHasType<DateTime>(i)) {
265            ReOrderToIndices<DateTime>(i, indices);
266          }
267        }
268      });
269    }
270
271    public void ShuffleToIndices(IList<System.Tuple<int, int>> indices) {
272      preprocessingData.InTransaction(() => {
273        for (int i = 0; i < preprocessingData.Columns; ++i) {
274          if (preprocessingData.VariableHasType<double>(i)) {
275            ShuffleToIndices<double>(i, indices);
276          } else if (preprocessingData.VariableHasType<string>(i)) {
277            ShuffleToIndices<string>(i, indices);
278          } else if (preprocessingData.VariableHasType<DateTime>(i)) {
279            ShuffleToIndices<DateTime>(i, indices);
280          }
281        }
282      });
283    }
284
285    private void ReOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
286      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));
287
288      // process all columns equally
289      foreach (Tuple<int, int> index in indices) {
290        int originalIndex = index.Item1;
291        int replaceIndex = index.Item2;
292
293        T replaceValue = originalData.ElementAt<T>(replaceIndex);
294        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
295      }
296    }
297
298    private void ShuffleToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
299      // process all columns equally
300      foreach (Tuple<int, int> index in indices) {
301        int originalIndex = index.Item1;
302        int replaceIndex = index.Item2;
303
304        T tmp = preprocessingData.GetCell<T>(columnIndex, originalIndex);
305        T replaceValue = preprocessingData.GetCell<T>(columnIndex, replaceIndex);
306
307        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
308        preprocessingData.SetCell<T>(columnIndex, replaceIndex, tmp);
309      }
310    }
311
312    public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
313      preprocessingData.InTransaction(() => {
314        foreach (var column in cells) {
315          foreach (var rowIdx in column.Value) {
316            preprocessingData.SetValue(value, column.Key, rowIdx);
317          }
318        }
319      });
320    }
321
322
323    public List<int> RowsWithMissingValuesGreater(double percent) {
324      List<int> rows = new List<int>();
325
326      for (int i = 0; i < preprocessingData.Rows; ++i) {
327        int missingCount = statisticsLogic.GetRowMissingValueCount(i);
328        if (100f / preprocessingData.Columns * missingCount > percent) {
329          rows.Add(i);
330        }
331      }
332
333      return rows;
334    }
335
336    public List<int> ColumnsWithMissingValuesGreater(double percent) {
337      List<int> columns = new List<int>();
338      for (int i = 0; i < preprocessingData.Columns; ++i) {
339        int missingCount = statisticsLogic.GetMissingValueCount(i);
340        if (100f / preprocessingData.Rows * missingCount > percent) {
341          columns.Add(i);
342        }
343      }
344
345      return columns;
346    }
347
348    public List<int> ColumnsWithVarianceSmaller(double variance) {
349      List<int> columns = new List<int>();
350      for (int i = 0; i < preprocessingData.Columns; ++i) {
351        if (preprocessingData.VariableHasType<double>(i) || preprocessingData.VariableHasType<DateTime>(i)) {
352          double columnVariance = statisticsLogic.GetVariance(i);
353          if (columnVariance < variance) {
354            columns.Add(i);
355          }
356        }
357      }
358      return columns;
359    }
360
361    public void DeleteRowsWithMissingValuesGreater(double percent) {
362      DeleteRows(RowsWithMissingValuesGreater(percent));
363    }
364
365    public void DeleteColumnsWithMissingValuesGreater(double percent) {
366      DeleteColumns(ColumnsWithMissingValuesGreater(percent));
367    }
368
369    public void DeleteColumnsWithVarianceSmaller(double variance) {
370      DeleteColumns(ColumnsWithVarianceSmaller(variance));
371    }
372
373    private void DeleteRows(List<int> rows) {
374      rows.Sort();
375      rows.Reverse();
376      preprocessingData.InTransaction(() => {
377        foreach (int row in rows) {
378          preprocessingData.DeleteRow(row);
379        }
380      });
381    }
382
383    private void DeleteColumns(List<int> columns) {
384      columns.Sort();
385      columns.Reverse();
386      preprocessingData.InTransaction(() => {
387        foreach (int column in columns) {
388          preprocessingData.DeleteColumn(column);
389        }
390      });
391    }
392  }
393}
Note: See TracBrowser for help on using the repository browser.