Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing Cleanup/HeuristicLab.DataPreprocessing/3.4/Logic/ManipulationLogic.cs @ 15269

Last change on this file since 15269 was 15269, checked in by pfleck, 7 years ago

#2809: Removed SearchLogic

File size: 14.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Data;
26using HeuristicLab.Random;
27
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Manipulation operations on an <see cref="ITransactionalPreprocessingData"/> instance:
  /// replacing selected cells by a constant, statistical, random or interpolated value,
  /// re-ordering/shuffling rows, and finding or deleting rows and columns by their share of
  /// missing values or by their variance.
  /// </summary>
  /// <remarks>
  /// Throughout this class a "cells" dictionary maps a column index (key) to the list of row
  /// indices (value) that should be manipulated in that column.
  /// </remarks>
  public class ManipulationLogic {
    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly StatisticsLogic statisticsLogic;

    /// <summary>Variable names as exposed by the underlying preprocessing data.</summary>
    public IEnumerable<string> VariableNames {
      get { return preprocessingData.VariableNames; }
    }

    /// <summary>The preprocessing data this logic operates on.</summary>
    public ITransactionalPreprocessingData PreProcessingData {
      get { return preprocessingData; }
    }

    /// <summary>Creates a manipulation logic working on the given data.</summary>
    /// <param name="preprocessingData">Data that is read and modified by all operations.</param>
    /// <param name="theStatisticsLogic">Source of the statistics (average, median, min, max, ...) used by the replace and filter operations.</param>
    public ManipulationLogic(ITransactionalPreprocessingData preprocessingData, StatisticsLogic theStatisticsLogic) {
      this.preprocessingData = preprocessingData;
      statisticsLogic = theStatisticsLogic;
    }

    /// <summary>Writes <paramref name="value"/> into every given row of one column.</summary>
    /// <remarks>Unlike the dictionary based overloads, this method does not open a transaction itself.</remarks>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Row indices of the cells to overwrite.</param>
    /// <param name="value">Replacement value.</param>
    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
      foreach (int index in rowIndices) {
        preprocessingData.SetCell<T>(columnIndex, index, value);
      }
    }

    /// <summary>
    /// Replaces the given cells by the average of their column, as reported by the statistics logic.
    /// Only double and DateTime columns are handled; columns of any other type are left untouched.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics query.</param>
    public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double average = statisticsLogic.GetAverage(column.Key, considerSelection);
            ReplaceIndicesByValue<double>(column.Key, column.Value, average);
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
          }
        }
      });
    }

    /// <summary>
    /// Replaces the given cells by the median of their column, as reported by the statistics logic.
    /// Only double and DateTime columns are handled; columns of any other type are left untouched.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics query.</param>
    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double median = statisticsLogic.GetMedian(column.Key, considerSelection);
            ReplaceIndicesByValue<double>(column.Key, column.Value, median);
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
          }
        }
      });
    }

    /// <summary>
    /// Replaces each given cell by an individually drawn random value between the minimum and the
    /// maximum of its column, as reported by the statistics logic.
    /// Only double and DateTime columns are handled; columns of any other type are left untouched.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics queries.</param>
    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        System.Random r = new System.Random();

        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double max = statisticsLogic.GetMax<double>(column.Key, double.NaN, considerSelection);
            double min = statisticsLogic.GetMin<double>(column.Key, double.NaN, considerSelection);
            double randMultiplier = (max - min);
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier + min;
              preprocessingData.SetCell<double>(column.Key, index, rand);
            }
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, DateTime.MinValue, considerSelection);
            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, DateTime.MinValue, considerSelection);
            // DateTime values are randomized as an offset in seconds from the minimum
            double randMultiplier = (max - min).TotalSeconds;
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier;
              preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
            }
          }
        }
      });
    }

    /// <summary>
    /// Replaces runs of consecutive selected rows by a linear interpolation between the nearest
    /// non-empty cell before the run and the nearest non-empty cell after it.
    /// Runs without a non-empty cell on both sides are left untouched.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column);
          foreach (var tuple in startEndings) {
            Interpolate(column.Key, tuple.Item1, tuple.Item2);
          }
        }
      });
    }

    /// <summary>
    /// Groups the (sorted) row indices of a column into runs of consecutive rows and returns, for
    /// every run that has a non-empty cell on both sides, the pair
    /// (index of previous non-empty cell, index of next non-empty cell).
    /// </summary>
    private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
      List<Tuple<int, int>> startEndings = new List<Tuple<int, int>>();
      // work on a sorted copy; the caller's list is not modified
      List<int> rowIndices = column.Value.OrderBy(x => x).ToList();
      int count = rowIndices.Count;
      // int.MinValue marks "no run in progress"
      int start = int.MinValue;
      for (int i = 0; i < count; ++i) {
        if (start == int.MinValue) {
          start = IndexOfPrevPresentValue(column.Key, rowIndices[i]);
        }
        // a run ends at the last index or where the next selected row is not adjacent
        bool endOfRun = i + 1 == count || rowIndices[i + 1] - rowIndices[i] > 1;
        if (endOfRun) {
          int next = IndexOfNextPresentValue(column.Key, rowIndices[i]);
          // IndexOfPrevPresentValue yields -1 when nothing was found, so row 0 is a valid anchor
          if (start >= 0 && next < preprocessingData.Rows) {
            startEndings.Add(new Tuple<int, int>(start, next));
          }
          start = int.MinValue;
        }
      }
      return startEndings;
    }

    /// <summary>
    /// For every given cell that is neither in the first nor in the last row, interpolates linearly
    /// between its nearest non-empty neighbours in the same column. Cells without a non-empty
    /// neighbour on both sides are skipped.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          int countValues = preprocessingData.Rows;

          foreach (int index in column.Value) {
            // don't replace first or last values
            if (index > 0 && index < countValues - 1) {
              int prevIndex = IndexOfPrevPresentValue(column.Key, index);
              int nextIndex = IndexOfNextPresentValue(column.Key, index);

              // no neighbours found
              if (prevIndex < 0 || nextIndex >= countValues) {
                continue;
              }

              Interpolate(column.Key, prevIndex, nextIndex);
            }
          }
        }
      });
    }

    /// <summary>
    /// Overwrites the cells strictly between <paramref name="prevIndex"/> and
    /// <paramref name="nextIndex"/> with values on the straight line through the two anchor cells.
    /// Only double and DateTime columns are handled; DateTime values are interpolated in seconds.
    /// </summary>
    private void Interpolate(int columnIndex, int prevIndex, int nextIndex) {
      int valuesToInterpolate = nextIndex - prevIndex;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double prev = preprocessingData.GetCell<double>(columnIndex, prevIndex);
        double next = preprocessingData.GetCell<double>(columnIndex, nextIndex);
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // start behind the anchor: the anchor cell keeps its value and must not be rewritten
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(columnIndex, i, interpolated);
        }
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime prev = preprocessingData.GetCell<DateTime>(columnIndex, prevIndex);
        DateTime next = preprocessingData.GetCell<DateTime>(columnIndex, nextIndex);
        double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;

        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<DateTime>(columnIndex, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Returns the index of the closest non-empty cell above row <paramref name="start"/>,
    /// or -1 if every cell above it is empty.
    /// </summary>
    private int IndexOfPrevPresentValue(int columnIndex, int start) {
      int offset = start - 1;
      while (offset >= 0 && preprocessingData.IsCellEmpty(columnIndex, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Returns the index of the closest non-empty cell below row <paramref name="start"/>,
    /// or the row count if every cell below it is empty.
    /// </summary>
    private int IndexOfNextPresentValue(int columnIndex, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && preprocessingData.IsCellEmpty(columnIndex, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Replaces the given cells by the most common value of their column, as reported by the
    /// statistics logic. Supports double, string and DateTime columns.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics query.</param>
    /// <exception cref="ArgumentException">A column in <paramref name="cells"/> has a type other than double, string or DateTime.</exception>
    public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, double.NaN, considerSelection));
          } else if (preprocessingData.VariableHasType<string>(column.Key)) {
            ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, string.Empty, considerSelection));
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, DateTime.MinValue, considerSelection));
          } else {
            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
          }
        }
      });
    }

    /// <summary>Randomly re-orders the rows of the data.</summary>
    /// <param name="shuffleRangesSeparately">
    /// If true, the rows of the test partition and of the training partition are shuffled
    /// independently of each other and rows outside these ranges keep their position;
    /// otherwise all rows are shuffled together.
    /// </param>
    public void Shuffle(bool shuffleRangesSeparately) {
      var random = new FastRandom();

      if (shuffleRangesSeparately) {
        var ranges = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
        preprocessingData.InTransaction(() => {
          // process all given ranges - e.g. TrainingPartition, TestPartition
          foreach (IntRange range in ranges) {
            // identity permutation everywhere except inside the current range
            var indices = Enumerable.Range(0, preprocessingData.Rows).ToArray();
            var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray();
            for (int i = range.Start, j = 0; i < range.End; i++, j++) {
              indices[i] = shuffledIndices[j];
            }

            ReOrderToIndices(indices);
          }
        });

      } else {
        preprocessingData.InTransaction(() => {
          var indices = Enumerable.Range(0, preprocessingData.Rows);
          var shuffledIndices = indices.Shuffle(random).ToArray();
          ReOrderToIndices(shuffledIndices);
        });
      }
    }

    /// <summary>
    /// Re-orders the rows of all double, string and DateTime columns so that afterwards row i holds
    /// the values that were previously stored in row <c>indices[i]</c>.
    /// </summary>
    /// <param name="indices">For every target row the index of the source row.</param>
    /// <exception cref="InvalidOperationException">The number of indices differs from the number of values in a column.</exception>
    public void ReOrderToIndices(int[] indices) {
      preprocessingData.InTransaction(() => {
        for (int i = 0; i < preprocessingData.Columns; ++i) {
          if (preprocessingData.VariableHasType<double>(i)) {
            ReOrderToIndices<double>(i, indices);
          } else if (preprocessingData.VariableHasType<string>(i)) {
            ReOrderToIndices<string>(i, indices);
          } else if (preprocessingData.VariableHasType<DateTime>(i)) {
            ReOrderToIndices<DateTime>(i, indices);
          }
        }
      });
    }

    /// <summary>Re-orders a single column; see <see cref="ReOrderToIndices(int[])"/>.</summary>
    private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
      // snapshot the column first, the cells are overwritten while iterating
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));
      if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");

      for (int i = 0; i < indices.Length; i++) {
        preprocessingData.SetCell<T>(columnIndex, i, originalData[indices[i]]);
      }
    }

    /// <summary>
    /// Sets every given cell to <paramref name="value"/> via the data's string based
    /// <c>SetValue</c> method.
    /// </summary>
    /// <param name="cells">Column index mapped to the row indices to replace.</param>
    /// <param name="value">Replacement value in its string representation.</param>
    public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          foreach (var rowIdx in column.Value) {
            preprocessingData.SetValue(value, column.Key, rowIdx);
          }
        }
      });
    }

    /// <summary>Returns the indices of all rows whose share of missing values exceeds <paramref name="percent"/>.</summary>
    /// <param name="percent">Threshold in percent (0-100), exclusive.</param>
    public List<int> RowsWithMissingValuesGreater(double percent) {
      List<int> rows = new List<int>();
      int columnCount = preprocessingData.Columns;

      for (int i = 0; i < preprocessingData.Rows; ++i) {
        int missingCount = statisticsLogic.GetRowMissingValueCount(i);
        // computed in double precision, multiplying before dividing
        if (100.0 * missingCount / columnCount > percent) {
          rows.Add(i);
        }
      }

      return rows;
    }

    /// <summary>Returns the indices of all columns whose share of missing values exceeds <paramref name="percent"/>.</summary>
    /// <param name="percent">Threshold in percent (0-100), exclusive.</param>
    public List<int> ColumnsWithMissingValuesGreater(double percent) {
      List<int> columns = new List<int>();
      int rowCount = preprocessingData.Rows;

      for (int i = 0; i < preprocessingData.Columns; ++i) {
        int missingCount = statisticsLogic.GetMissingValueCount(i);
        // computed in double precision, multiplying before dividing
        if (100.0 * missingCount / rowCount > percent) {
          columns.Add(i);
        }
      }

      return columns;
    }

    /// <summary>
    /// Returns the indices of all double and DateTime columns whose variance, as reported by the
    /// statistics logic, is smaller than <paramref name="variance"/>.
    /// </summary>
    /// <param name="variance">Threshold, exclusive.</param>
    public List<int> ColumnsWithVarianceSmaller(double variance) {
      List<int> columns = new List<int>();
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i) || preprocessingData.VariableHasType<DateTime>(i)) {
          double columnVariance = statisticsLogic.GetVariance(i);
          if (columnVariance < variance) {
            columns.Add(i);
          }
        }
      }
      return columns;
    }

    /// <summary>Deletes all rows whose share of missing values exceeds <paramref name="percent"/>.</summary>
    public void DeleteRowsWithMissingValuesGreater(double percent) {
      DeleteRows(RowsWithMissingValuesGreater(percent));
    }

    /// <summary>Deletes all columns whose share of missing values exceeds <paramref name="percent"/>.</summary>
    public void DeleteColumnsWithMissingValuesGreater(double percent) {
      DeleteColumns(ColumnsWithMissingValuesGreater(percent));
    }

    /// <summary>Deletes all double and DateTime columns whose variance is smaller than <paramref name="variance"/>.</summary>
    public void DeleteColumnsWithVarianceSmaller(double variance) {
      DeleteColumns(ColumnsWithVarianceSmaller(variance));
    }

    /// <summary>Deletes the given rows in one transaction without modifying the passed list.</summary>
    private void DeleteRows(List<int> rows) {
      // delete from the highest index downwards so the remaining indices stay valid
      List<int> descending = rows.OrderByDescending(row => row).ToList();
      preprocessingData.InTransaction(() => {
        foreach (int row in descending) {
          preprocessingData.DeleteRow(row);
        }
      });
    }

    /// <summary>Deletes the given columns in one transaction without modifying the passed list.</summary>
    private void DeleteColumns(List<int> columns) {
      // delete from the highest index downwards so the remaining indices stay valid
      List<int> descending = columns.OrderByDescending(column => column).ToList();
      preprocessingData.InTransaction(() => {
        foreach (int column in descending) {
          preprocessingData.DeleteColumn(column);
        }
      });
    }
  }
}
Note: See TracBrowser for help on using the repository browser.