Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/ManipulationLogic.cs @ 10854

Last change on this file since 10854 was 10820, checked in by rstoll, 11 years ago
  • Interpolation implemented
  • Smoothing fixed
  • Interpolation/Smoothing menu disabled if first column is selected as well
File size: 15.7 KB
RevLine 
[10539]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10193]23using System.Collections.Generic;
24using System.Linq;
[10249]25using HeuristicLab.Data;
[10193]26
[10249]27namespace HeuristicLab.DataPreprocessing {
[10369]28  public class ManipulationLogic : IManipulationLogic {
[10586]29    private ITransactionalPreprocessingData preprocessingData;
[10615]30    private IStatisticsLogic statisticsLogic;
[10249]31    private ISearchLogic searchLogic;
[10672]32    private IDataGridLogic dataGridLogic;
[10193]33
[10672]34    public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic, IDataGridLogic theDataGridLogic) {
[10249]35      preprocessingData = _prepocessingData;
36      searchLogic = theSearchLogic;
[10615]37      statisticsLogic = theStatisticsLogic;
[10672]38      dataGridLogic = theDataGridLogic;
[10249]39    }
[10193]40
[10367]41    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
42      foreach (int index in rowIndices) {
43        preprocessingData.SetCell<T>(columnIndex, index, value);
[10249]44      }
45    }
[10193]46
[10809]47    public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
[10612]48      preprocessingData.InTransaction(() => {
49        foreach (var column in cells) {
[10615]50          if (preprocessingData.IsType<double>(column.Key)) {
[10809]51            double average = statisticsLogic.GetAverage(column.Key, considerSelection);
[10615]52            ReplaceIndicesByValue<double>(column.Key, column.Value, average);
53          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10809]54            DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
[10615]55            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
56          }
[10612]57        }
58      });
[10249]59    }
[10193]60
[10809]61    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
[10612]62      preprocessingData.InTransaction(() => {
63        foreach (var column in cells) {
[10615]64          if (preprocessingData.IsType<double>(column.Key)) {
[10809]65            double median = statisticsLogic.GetMedian(column.Key, considerSelection);
[10615]66            ReplaceIndicesByValue<double>(column.Key, column.Value, median);
67          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10809]68            DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
[10615]69            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
70          }
[10612]71        }
72      });
[10249]73    }
[10193]74
[10809]75    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
[10612]76      preprocessingData.InTransaction(() => {
77        Random r = new Random();
[10193]78
[10612]79        foreach (var column in cells) {
[10615]80          if (preprocessingData.IsType<double>(column.Key)) {
[10809]81            double max = statisticsLogic.GetMax<double>(column.Key, considerSelection);
82            double min = statisticsLogic.GetMin<double>(column.Key, considerSelection);
[10615]83            double randMultiplier = (max - min);
84            foreach (int index in column.Value) {
85              double rand = r.NextDouble() * randMultiplier + min;
86              preprocessingData.SetCell<double>(column.Key, index, rand);
87            }
88          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10809]89            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, considerSelection);
90            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, considerSelection);
[10615]91            double randMultiplier = (max - min).TotalSeconds;
92            foreach (int index in column.Value) {
93              double rand = r.NextDouble() * randMultiplier;
94              preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
95            }
[10612]96          }
[10590]97        }
[10612]98      });
[10249]99    }
[10193]100
[10672]101    public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
[10612]102      preprocessingData.InTransaction(() => {
103        foreach (var column in cells) {
[10621]104          int countValues = 0;
[10615]105          if (preprocessingData.IsType<double>(column.Key)) {
[10811]106            countValues = preprocessingData.GetValues<double>(column.Key).Count();
[10621]107          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10811]108            countValues = preprocessingData.GetValues<DateTime>(column.Key).Count();
[10621]109          }
[10193]110
[10820]111          IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column);
112          foreach (var tuple in startEndings) {
113            Interpolate(column, tuple.Item1, tuple.Item2);
114          }
115        }
116      });
117    }
118
119    private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
120      List<Tuple<int, int>> startEndings = new List<Tuple<int, int>>();
121      var rowIndices = column.Value;
122      rowIndices = rowIndices.OrderBy(x => x).ToList();
123      var count = rowIndices.Count;
124      int start = int.MinValue;
125      for (int i = 0; i < count; ++i) {
126        if (start == int.MinValue) {
127          start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
128        }
129        if (i + 1 == count || (i + 1 < count && rowIndices[i + 1] - rowIndices[i] > 1)) {
130          int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
131          if (start > 0 && next < preprocessingData.Rows) {
132            startEndings.Add(new Tuple<int, int>(start, next));
133          }
134          start = int.MinValue;
135        }
136      }
137      return startEndings;
138    }
139
140    public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
141      preprocessingData.InTransaction(() => {
142        foreach (var column in cells) {
143          int countValues = preprocessingData.Rows;
144
[10621]145          foreach (int index in column.Value) {
146            // dont replace first or last values
147            if (index > 0 && index < countValues) {
148              int prevIndex = indexOfPrevPresentValue(column.Key, index);
149              int nextIndex = indexOfNextPresentValue(column.Key, index);
150
151              // no neighbours found
[10820]152              if (prevIndex < 0 || nextIndex >= countValues) {
[10621]153                continue;
154              }
155
[10820]156              Interpolate(column, prevIndex, nextIndex);
[10590]157            }
[10249]158          }
[10193]159        }
[10612]160      });
[10249]161    }
[10193]162
[10820]163    private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
164      int valuesToInterpolate = nextIndex - prevIndex;
165
166      if (preprocessingData.IsType<double>(column.Key)) {
167        double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
168        double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
169        double interpolationStep = (next - prev) / valuesToInterpolate;
170
171        for (int i = prevIndex; i < nextIndex; ++i) {
172          double interpolated = prev + (interpolationStep * (i - prevIndex));
173          preprocessingData.SetCell<double>(column.Key, i, interpolated);
174        }
175      } else if (preprocessingData.IsType<DateTime>(column.Key)) {
176        DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
177        DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
178        double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
179
180        for (int i = prevIndex; i < nextIndex; ++i) {
181          DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
182          preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
183        }
184      }
185    }
186
[10367]187    private int indexOfPrevPresentValue(int columnIndex, int start) {
[10249]188      int offset = start - 1;
[10367]189      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
[10249]190        offset--;
191      }
[10234]192
[10249]193      return offset;
194    }
[10234]195
[10367]196    private int indexOfNextPresentValue(int columnIndex, int start) {
[10249]197      int offset = start + 1;
[10367]198      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
[10249]199        offset++;
200      }
[10234]201
[10249]202      return offset;
203    }
[10234]204
[10809]205    public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
[10612]206      preprocessingData.InTransaction(() => {
207        foreach (var column in cells) {
208          if (preprocessingData.IsType<double>(column.Key)) {
[10809]209            ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, considerSelection));
[10612]210          } else if (preprocessingData.IsType<string>(column.Key)) {
[10809]211            ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, considerSelection));
[10612]212          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10809]213            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, considerSelection));
[10612]214          } else {
215            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
216          }
[10590]217        }
[10612]218      });
[10249]219    }
[10218]220
[10709]221    public void ShuffleWithRanges() {
222      ShuffleWithRanges(new[] {
223        preprocessingData.TestPartition,
224        preprocessingData.TrainingPartition
225      });
226    }
227
[10249]228    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
229      // init random outside loop
230      Random random = new Random();
[10218]231
[10612]232      preprocessingData.InTransaction(() => {
[10709]233        // process all given ranges - e.g. TrainingPartition, TestPartition
[10612]234        foreach (IntRange range in ranges) {
235          List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();
[10218]236
[10612]237          // generate random indices used for shuffeling each column
[10709]238          for (int i = range.End - 1; i >= range.Start; --i) {
[10612]239            int rand = random.Next(range.Start, i);
240            shuffledIndices.Add(new Tuple<int, int>(i, rand));
241          }
242
[10718]243          ShuffleToIndices(shuffledIndices);
[10218]244        }
[10612]245      });
[10253]246    }
247
[10535]248    public void ReOrderToIndices(IEnumerable<int> indices) {
[10256]249      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();
[10255]250
[10256]251      for (int i = 0; i < indices.Count(); ++i) {
[10311]252        indicesTuple.Add(new Tuple<int, int>(i, indices.ElementAt(i)));
[10256]253      }
254
[10535]255      ReOrderToIndices(indicesTuple);
[10255]256    }
257
[10535]258    public void ReOrderToIndices(IList<System.Tuple<int, int>> indices) {
[10612]259      preprocessingData.InTransaction(() => {
260        for (int i = 0; i < preprocessingData.Columns; ++i) {
261          if (preprocessingData.IsType<double>(i)) {
262            reOrderToIndices<double>(i, indices);
263          } else if (preprocessingData.IsType<string>(i)) {
264            reOrderToIndices<string>(i, indices);
265          } else if (preprocessingData.IsType<DateTime>(i)) {
266            reOrderToIndices<DateTime>(i, indices);
267          }
[10249]268        }
[10612]269      });
[10249]270    }
[10218]271
[10820]272    public void ShuffleToIndices(IList<System.Tuple<int, int>> indices) {
273      preprocessingData.InTransaction(() => {
274        for (int i = 0; i < preprocessingData.Columns; ++i) {
275          if (preprocessingData.IsType<double>(i)) {
[10718]276            ShuffleToIndices<double>(i, indices);
[10820]277          } else if (preprocessingData.IsType<string>(i)) {
[10718]278            ShuffleToIndices<string>(i, indices);
[10820]279          } else if (preprocessingData.IsType<DateTime>(i)) {
[10718]280            ShuffleToIndices<DateTime>(i, indices);
281          }
282        }
283      });
284    }
285
[10367]286    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
[10308]287
[10811]288      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));
[10308]289
[10249]290      // process all columns equally
291      foreach (Tuple<int, int> index in indices) {
292        int originalIndex = index.Item1;
293        int replaceIndex = index.Item2;
[10218]294
[10308]295        T replaceValue = originalData.ElementAt<T>(replaceIndex);
[10367]296        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
[10249]297      }
[10193]298    }
[10672]299
[10820]300    private void ShuffleToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
[10718]301      // process all columns equally
[10820]302      foreach (Tuple<int, int> index in indices) {
[10718]303        int originalIndex = index.Item1;
304        int replaceIndex = index.Item2;
305
306        T tmp = preprocessingData.GetCell<T>(columnIndex, originalIndex);
307        T replaceValue = preprocessingData.GetCell<T>(columnIndex, replaceIndex);
308
309        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
310        preprocessingData.SetCell<T>(columnIndex, replaceIndex, tmp);
311      }
312    }
313
[10672]314    public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
315      preprocessingData.InTransaction(() => {
316        foreach (var column in cells) {
317          foreach (var rowIdx in column.Value) {
318            dataGridLogic.SetValue(value, column.Key, rowIdx);
319          }
320        }
321      });
322    }
[10711]323
324
[10715]325    public List<int> RowsWithMissingValuesGreater(double percent) {
326
[10820]327      List<int> rows = new List<int>();
[10715]328
[10820]329      for (int i = 0; i < preprocessingData.Rows; ++i) {
[10711]330        int missingCount = statisticsLogic.GetRowMissingValueCount(i);
[10820]331        if (100f / preprocessingData.Columns * missingCount > percent) {
[10715]332          rows.Add(i);
[10711]333        }
334      }
[10715]335
336      return rows;
[10711]337    }
338
[10715]339    public List<int> ColumnsWithMissingValuesGreater(double percent) {
340
341      List<int> columns = new List<int>();
[10737]342      for (int i = 0; i < preprocessingData.Columns; ++i) {
[10711]343        int missingCount = statisticsLogic.GetMissingValueCount(i);
[10737]344        if (100f / preprocessingData.Rows * missingCount > percent) {
[10715]345          columns.Add(i);
[10711]346        }
347      }
[10715]348
349      return columns;
[10711]350    }
351
[10715]352    public List<int> ColumnsWithVarianceSmaller(double variance) {
353
354      List<int> columns = new List<int>();
[10737]355      for (int i = 0; i < preprocessingData.Columns; ++i) {
[10820]356        if (preprocessingData.IsType<double>(i) || preprocessingData.IsType<DateTime>(i)) {
[10711]357          double columnVariance = statisticsLogic.GetVariance(i);
[10820]358          if (columnVariance < variance) {
[10715]359            columns.Add(i);
[10711]360          }
361        }
362      }
[10715]363      return columns;
[10711]364    }
365
[10715]366    public void DeleteRowsWithMissingValuesGreater(double percent) {
367      DeleteRows(RowsWithMissingValuesGreater(percent));
368    }
369
370    public void DeleteColumnsWithMissingValuesGreater(double percent) {
371      DeleteColumns(ColumnsWithMissingValuesGreater(percent));
372    }
373
374    public void DeleteColumnsWithVarianceSmaller(double variance) {
375      DeleteColumns(ColumnsWithVarianceSmaller(variance));
376    }
377
[10737]378    private void DeleteRows(List<int> rows) {
379      rows.Sort();
380      rows.Reverse();
[10820]381      preprocessingData.InTransaction(() => {
382        foreach (int row in rows) {
[10715]383          preprocessingData.DeleteRow(row);
384        }
385      });
386    }
387
[10737]388    private void DeleteColumns(List<int> columns) {
389      columns.Sort();
390      columns.Reverse();
[10820]391      preprocessingData.InTransaction(() => {
392        foreach (int column in columns) {
[10715]393          preprocessingData.DeleteColumn(column);
394        }
395      });
396    }
[10737]397
398    public event DataPreprocessingChangedEventHandler Changed {
399      add { dataGridLogic.Changed += value; }
400      remove { dataGridLogic.Changed -= value; }
401    }
[10249]402  }
[10193]403}
Note: See TracBrowser for help on using the repository browser.