Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2695_dataset-ids/HeuristicLab.DataPreprocessing/3.4/Logic/ManipulationLogic.cs @ 17399

Last change on this file since 17399 was 15110, checked in by pfleck, 8 years ago

#2709: merged branch to trunk

File size: 14.4 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Data;
26using HeuristicLab.Random;
27
28namespace HeuristicLab.DataPreprocessing {
29  public class ManipulationLogic {
30    private readonly ITransactionalPreprocessingData preprocessingData;
31    private readonly StatisticsLogic statisticsLogic;
32    private readonly SearchLogic searchLogic;
33
/// <summary>
/// Gets the variable names reported by the wrapped preprocessing data.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    return this.preprocessingData.VariableNames;
  }
}
37
/// <summary>
/// Gets the preprocessing data instance this logic object was constructed with.
/// </summary>
public ITransactionalPreprocessingData PreProcessingData {
  get {
    return this.preprocessingData;
  }
}
41
/// <summary>
/// Creates the manipulation logic on top of the given data and helper logic objects.
/// </summary>
/// <param name="preprocessingData">Data that all manipulation methods read from and write to.</param>
/// <param name="theSearchLogic">Used to test whether a cell is a missing value.</param>
/// <param name="theStatisticsLogic">Used to obtain averages, medians, extrema, variances and missing-value counts.</param>
public ManipulationLogic(ITransactionalPreprocessingData preprocessingData, SearchLogic theSearchLogic, StatisticsLogic theStatisticsLogic) {
  this.preprocessingData = preprocessingData;
  this.searchLogic = theSearchLogic;
  this.statisticsLogic = theStatisticsLogic;
}
47
/// <summary>
/// Writes the same value into every listed row of a single column.
/// </summary>
/// <typeparam name="T">Type of the value handed to SetCell.</typeparam>
/// <param name="columnIndex">Column whose cells are overwritten.</param>
/// <param name="rowIndices">Rows to overwrite; enumerated exactly once.</param>
/// <param name="value">Replacement value.</param>
/// <remarks>This method does not open a transaction itself.</remarks>
public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
  foreach (var rowIndex in rowIndices) {
    preprocessingData.SetCell<T>(columnIndex, rowIndex, value);
  }
}
53
/// <summary>
/// Replaces the given cells with the average of their column.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
/// <remarks>
/// Only double and DateTime columns are processed; columns of any other type are skipped silently.
/// All writes happen inside one InTransaction call.
/// </remarks>
public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double numericAverage = statisticsLogic.GetAverage(columnIndex, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, numericAverage);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime dateAverage = statisticsLogic.GetAverageDateTime(columnIndex, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, dateAverage);
      }
    }
  });
}
67
/// <summary>
/// Replaces the given cells with the median of their column.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
/// <remarks>
/// Only double and DateTime columns are processed; columns of any other type are skipped silently.
/// All writes happen inside one InTransaction call.
/// </remarks>
public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double numericMedian = statisticsLogic.GetMedian(columnIndex, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, numericMedian);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime dateMedian = statisticsLogic.GetMedianDateTime(columnIndex, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, dateMedian);
      }
    }
  });
}
81
/// <summary>
/// Replaces the given cells with random values drawn between the column's minimum and maximum.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
/// <remarks>
/// A new System.Random is created per call. For double columns each value is
/// NextDouble() * (max - min) + min; for DateTime columns a random number of seconds within
/// (max - min) is added to min. Columns of any other type are skipped silently.
/// </remarks>
public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    var generator = new System.Random();

    foreach (var entry in cells) {
      int columnIndex = entry.Key;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double upper = statisticsLogic.GetMax<double>(columnIndex, double.NaN, considerSelection);
        double lower = statisticsLogic.GetMin<double>(columnIndex, double.NaN, considerSelection);
        double width = upper - lower;

        foreach (int rowIndex in entry.Value) {
          double sample = generator.NextDouble() * width + lower;
          preprocessingData.SetCell<double>(columnIndex, rowIndex, sample);
        }
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime earliest = statisticsLogic.GetMin<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        DateTime latest = statisticsLogic.GetMax<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        double widthInSeconds = (latest - earliest).TotalSeconds;

        foreach (int rowIndex in entry.Value) {
          double secondsOffset = generator.NextDouble() * widthInSeconds;
          preprocessingData.SetCell<DateTime>(columnIndex, rowIndex, earliest.AddSeconds(secondsOffset));
        }
      }
    }
  });
}
107
/// <summary>
/// Replaces the given cells by linear interpolation between the nearest present values
/// before and after each run of consecutive rows.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <remarks>
/// The row ranges come from GetStartAndEndingsForInterpolation and each range is filled by
/// Interpolate. Everything runs inside one InTransaction call.
/// </remarks>
public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      foreach (var range in GetStartAndEndingsForInterpolation(entry)) {
        int from = range.Item1;
        int to = range.Item2;
        Interpolate(entry, from, to);
      }
    }
  });
}
118
/// <summary>
/// Groups the selected rows of a column into runs of consecutive indices and returns, for each
/// run, the pair (index of previous present value, index of next present value).
/// </summary>
/// <param name="column">Key is the column index, Value the selected row indices (any order).</param>
/// <returns>
/// One tuple per run for which both neighbours exist. Runs without a present value before
/// them (start == -1) or after them (next == Rows) are omitted.
/// </returns>
private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
  var startEndings = new List<Tuple<int, int>>();
  // Work on a sorted copy so that adjacent list entries can be compared to detect gaps.
  List<int> rowIndices = column.Value.OrderBy(x => x).ToList();
  int count = rowIndices.Count;

  // int.MinValue marks "no run open"; indexOfPrevPresentValue never returns less than -1.
  int start = int.MinValue;
  for (int i = 0; i < count; ++i) {
    if (start == int.MinValue) {
      start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
    }

    // The run ends at the last selected row or where the next selected row is not adjacent.
    bool runEndsHere = i + 1 == count || rowIndices[i + 1] - rowIndices[i] > 1;
    if (!runEndsHere) {
      continue;
    }

    int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
    // Row 0 is a valid left anchor; only -1 means that no previous value was found.
    if (start >= 0 && next < preprocessingData.Rows) {
      startEndings.Add(new Tuple<int, int>(start, next));
    }
    start = int.MinValue;
  }
  return startEndings;
}
139
/// <summary>
/// Replaces each given cell by interpolating between its nearest present neighbours.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <remarks>
/// Row 0 and indices at or beyond the row count are skipped up front. A cell is also skipped
/// when no present value exists before it (previous index &lt; 0) or after it (next index
/// &gt;= row count). Each remaining cell triggers an Interpolate call over the whole span
/// between the two neighbours.
/// </remarks>
public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int rowCount = preprocessingData.Rows;

      foreach (int rowIndex in entry.Value) {
        if (rowIndex <= 0 || rowIndex >= rowCount) {
          continue;
        }

        int before = indexOfPrevPresentValue(entry.Key, rowIndex);
        int after = indexOfNextPresentValue(entry.Key, rowIndex);

        // Both neighbours must exist, otherwise there is nothing to interpolate between.
        bool hasBothNeighbours = before >= 0 && after < rowCount;
        if (hasBothNeighbours) {
          Interpolate(entry, before, after);
        }
      }
    }
  });
}
152              if (prevIndex < 0 || nextIndex >= countValues) {
153                continue;
154              }
155
156              Interpolate(column, prevIndex, nextIndex);
157            }
158          }
159        }
160      });
161    }
162
/// <summary>
/// Fills the rows [prevIndex, nextIndex) of a column with values on the straight line between
/// the cell at prevIndex and the cell at nextIndex.
/// </summary>
/// <param name="column">Only the Key (column index) is used.</param>
/// <param name="prevIndex">Row of the left anchor; the loop starts writing at this row (offset 0).</param>
/// <param name="nextIndex">Row of the right anchor; it is read but never written.</param>
/// <remarks>
/// Handles double columns directly and DateTime columns via a step measured in seconds.
/// Columns of any other type are left untouched.
/// </remarks>
private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
  int columnIndex = column.Key;
  int span = nextIndex - prevIndex;

  if (preprocessingData.VariableHasType<double>(columnIndex)) {
    double left = preprocessingData.GetCell<double>(columnIndex, prevIndex);
    double right = preprocessingData.GetCell<double>(columnIndex, nextIndex);
    double step = (right - left) / span;

    for (int offset = 0; offset < span; ++offset) {
      double value = left + (step * offset);
      preprocessingData.SetCell<double>(columnIndex, prevIndex + offset, value);
    }
    return;
  }

  if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
    DateTime left = preprocessingData.GetCell<DateTime>(columnIndex, prevIndex);
    DateTime right = preprocessingData.GetCell<DateTime>(columnIndex, nextIndex);
    double stepInSeconds = (right - left).TotalSeconds / span;

    for (int offset = 0; offset < span; ++offset) {
      DateTime value = left.AddSeconds(stepInSeconds * offset);
      preprocessingData.SetCell<DateTime>(columnIndex, prevIndex + offset, value);
    }
  }
}
186
/// <summary>
/// Searches upwards from the row before <paramref name="start"/> for the first cell that is
/// not a missing value.
/// </summary>
/// <param name="columnIndex">Column to search in.</param>
/// <param name="start">Row whose predecessor is the first candidate.</param>
/// <returns>The row index found, or -1 when every earlier row is missing (for start &gt;= 0).</returns>
private int indexOfPrevPresentValue(int columnIndex, int start) {
  int candidate;
  for (candidate = start - 1; candidate >= 0; candidate--) {
    if (!searchLogic.IsMissingValue(columnIndex, candidate)) {
      break;
    }
  }
  return candidate;
}
195
/// <summary>
/// Searches downwards from the row after <paramref name="start"/> for the first cell that is
/// not a missing value.
/// </summary>
/// <param name="columnIndex">Column to search in.</param>
/// <param name="start">Row whose successor is the first candidate.</param>
/// <returns>
/// The row index found. When the search runs off the end of the data the result is
/// greater than or equal to preprocessingData.Rows.
/// </returns>
private int indexOfNextPresentValue(int columnIndex, int start) {
  int candidate;
  for (candidate = start + 1; candidate < preprocessingData.Rows; candidate++) {
    if (!searchLogic.IsMissingValue(columnIndex, candidate)) {
      break;
    }
  }
  return candidate;
}
204
/// <summary>
/// Replaces the given cells with the most common value of their column.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
/// <exception cref="ArgumentException">
/// A column is neither of type double, string nor DateTime.
/// </exception>
/// <remarks>
/// The second argument passed to GetMostCommonValue is double.NaN, string.Empty and
/// DateTime.MinValue respectively.
/// </remarks>
public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double mostCommon = statisticsLogic.GetMostCommonValue<double>(columnIndex, double.NaN, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.VariableHasType<string>(columnIndex)) {
        string mostCommon = statisticsLogic.GetMostCommonValue<string>(columnIndex, string.Empty, considerSelection);
        ReplaceIndicesByValue<string>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime mostCommon = statisticsLogic.GetMostCommonValue<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
    }
  });
}
220
/// <summary>
/// Randomly reorders the rows of the data.
/// </summary>
/// <param name="shuffleRangesSeparately">
/// When true, the test partition and then the training partition are each shuffled within
/// their own row range while all other rows keep their position. When false, all rows are
/// shuffled together.
/// </param>
/// <remarks>
/// A new FastRandom instance is created per call. The partitions are read before the
/// transaction is opened.
/// </remarks>
public void Shuffle(bool shuffleRangesSeparately) {
  var random = new FastRandom();

  if (!shuffleRangesSeparately) {
    preprocessingData.InTransaction(() => {
      var allRows = Enumerable.Range(0, preprocessingData.Rows);
      int[] permutation = allRows.Shuffle(random).ToArray();
      ReOrderToIndices(permutation);
    });
    return;
  }

  var ranges = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
  preprocessingData.InTransaction(() => {
    foreach (IntRange range in ranges) {
      // Start from the identity mapping and overwrite only the slots that belong to this range.
      int[] indices = Enumerable.Range(0, preprocessingData.Rows).ToArray();
      int[] shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray();

      for (int j = 0; range.Start + j < range.End; j++) {
        indices[range.Start + j] = shuffledIndices[j];
      }

      ReOrderToIndices(indices);
    }
  });
}
246
/// <summary>
/// Reorders the rows of every double, string and DateTime column so that row i receives the
/// value that was previously stored in row indices[i].
/// </summary>
/// <param name="indices">Source row index for each target row.</param>
/// <remarks>Columns of any other type are left untouched. Runs inside one InTransaction call.</remarks>
public void ReOrderToIndices(int[] indices) {
  preprocessingData.InTransaction(() => {
    int columnCount = preprocessingData.Columns;
    for (int columnIndex = 0; columnIndex < columnCount; ++columnIndex) {
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        ReOrderToIndices<double>(columnIndex, indices);
        continue;
      }
      if (preprocessingData.VariableHasType<string>(columnIndex)) {
        ReOrderToIndices<string>(columnIndex, indices);
        continue;
      }
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        ReOrderToIndices<DateTime>(columnIndex, indices);
      }
    }
  });
}
260
/// <summary>
/// Reorders one column: row i is set to the value that row indices[i] held before the call.
/// </summary>
/// <typeparam name="T">Element type used to read and write the column.</typeparam>
/// <param name="columnIndex">Column to reorder.</param>
/// <param name="indices">Source row index for each target row.</param>
/// <exception cref="InvalidOperationException">
/// The number of indices differs from the number of values in the column.
/// </exception>
private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
  // Snapshot the column first so that later writes cannot affect the values still to be read.
  var snapshot = new List<T>(preprocessingData.GetValues<T>(columnIndex));
  if (indices.Length != snapshot.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");

  for (int targetRow = 0; targetRow < indices.Length; targetRow++) {
    int sourceRow = indices[targetRow];
    preprocessingData.SetCell<T>(columnIndex, targetRow, snapshot[sourceRow]);
  }
}
273
/// <summary>
/// Writes one string value into all given cells via preprocessingData.SetValue.
/// </summary>
/// <param name="cells">Maps a column index to the row indices that should be replaced in that column.</param>
/// <param name="value">Value passed to SetValue for every cell.</param>
/// <remarks>All writes happen inside one InTransaction call.</remarks>
public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      foreach (int rowIndex in entry.Value) {
        preprocessingData.SetValue(value, columnIndex, rowIndex);
      }
    }
  });
}
283
284
/// <summary>
/// Finds the rows whose share of missing values exceeds a threshold.
/// </summary>
/// <param name="percent">Threshold in percent; a row is returned only if its share is strictly greater.</param>
/// <returns>Indices of the matching rows in ascending order.</returns>
/// <remarks>
/// The share is computed in double precision and multiplied before dividing, so it is not
/// affected by single-precision rounding of 100 / Columns before being compared with the
/// double threshold.
/// </remarks>
public List<int> RowsWithMissingValuesGreater(double percent) {
  var rows = new List<int>();
  int columnCount = preprocessingData.Columns;

  for (int i = 0; i < preprocessingData.Rows; ++i) {
    int missingCount = statisticsLogic.GetRowMissingValueCount(i);
    double missingPercent = 100.0 * missingCount / columnCount;
    if (missingPercent > percent) {
      rows.Add(i);
    }
  }

  return rows;
}
297
/// <summary>
/// Finds the columns whose share of missing values exceeds a threshold.
/// </summary>
/// <param name="percent">Threshold in percent; a column is returned only if its share is strictly greater.</param>
/// <returns>Indices of the matching columns in ascending order.</returns>
/// <remarks>
/// The share is computed in double precision and multiplied before dividing, so it is not
/// affected by single-precision rounding of 100 / Rows before being compared with the
/// double threshold.
/// </remarks>
public List<int> ColumnsWithMissingValuesGreater(double percent) {
  var columns = new List<int>();
  int rowCount = preprocessingData.Rows;

  for (int i = 0; i < preprocessingData.Columns; ++i) {
    int missingCount = statisticsLogic.GetMissingValueCount(i);
    double missingPercent = 100.0 * missingCount / rowCount;
    if (missingPercent > percent) {
      columns.Add(i);
    }
  }

  return columns;
}
309
/// <summary>
/// Finds the double and DateTime columns whose variance is below a threshold.
/// </summary>
/// <param name="variance">Threshold; a column is returned only if its variance is strictly smaller.</param>
/// <returns>Indices of the matching columns in ascending order.</returns>
/// <remarks>Columns of other types are never returned.</remarks>
public List<int> ColumnsWithVarianceSmaller(double variance) {
  var lowVarianceColumns = new List<int>();

  for (int columnIndex = 0; columnIndex < preprocessingData.Columns; ++columnIndex) {
    bool hasVariance = preprocessingData.VariableHasType<double>(columnIndex) || preprocessingData.VariableHasType<DateTime>(columnIndex);
    if (!hasVariance) {
      continue;
    }

    double columnVariance = statisticsLogic.GetVariance(columnIndex);
    if (columnVariance < variance) {
      lowVarianceColumns.Add(columnIndex);
    }
  }

  return lowVarianceColumns;
}
322
/// <summary>
/// Deletes every row returned by RowsWithMissingValuesGreater for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed on unchanged.</param>
public void DeleteRowsWithMissingValuesGreater(double percent) {
  List<int> affectedRows = RowsWithMissingValuesGreater(percent);
  DeleteRows(affectedRows);
}
326
/// <summary>
/// Deletes every column returned by ColumnsWithMissingValuesGreater for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed on unchanged.</param>
public void DeleteColumnsWithMissingValuesGreater(double percent) {
  List<int> affectedColumns = ColumnsWithMissingValuesGreater(percent);
  DeleteColumns(affectedColumns);
}
330
/// <summary>
/// Deletes every column returned by ColumnsWithVarianceSmaller for the given threshold.
/// </summary>
/// <param name="variance">Variance threshold, passed on unchanged.</param>
public void DeleteColumnsWithVarianceSmaller(double variance) {
  List<int> affectedColumns = ColumnsWithVarianceSmaller(variance);
  DeleteColumns(affectedColumns);
}
334
/// <summary>
/// Deletes the given rows inside one InTransaction call.
/// </summary>
/// <param name="rows">Row indices to delete; the list is sorted in place into descending order.</param>
/// <remarks>
/// Deleting from the highest index downwards keeps the remaining, smaller indices valid
/// while rows are being removed.
/// </remarks>
private void DeleteRows(List<int> rows) {
  rows.Sort((a, b) => b.CompareTo(a));
  preprocessingData.InTransaction(() => {
    foreach (int rowIndex in rows) {
      preprocessingData.DeleteRow(rowIndex);
    }
  });
}
344
/// <summary>
/// Deletes the given columns inside one InTransaction call.
/// </summary>
/// <param name="columns">Column indices to delete; the list is sorted in place into descending order.</param>
/// <remarks>
/// Deleting from the highest index downwards keeps the remaining, smaller indices valid
/// while columns are being removed.
/// </remarks>
private void DeleteColumns(List<int> columns) {
  columns.Sort((a, b) => b.CompareTo(a));
  preprocessingData.InTransaction(() => {
    foreach (int columnIndex in columns) {
      preprocessingData.DeleteColumn(columnIndex);
    }
  });
}
354  }
355}
Note: See TracBrowser for help on using the repository browser.