Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataPreprocessing/3.4/Logic/ManipulationLogic.cs @ 15094

Last change on this file since 15094 was 14886, checked in by mkommend, 7 years ago

#2778: Refactored and corrected shuffling in DataPreprocessing.

File size: 14.7 KB
RevLine 
[10539]1#region License Information
2/* HeuristicLab
[14185]3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[10539]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10193]23using System.Collections.Generic;
24using System.Linq;
[10249]25using HeuristicLab.Data;
[14886]26using HeuristicLab.Random;
[10193]27
[10249]28namespace HeuristicLab.DataPreprocessing {
[13508]29  public class ManipulationLogic {
[11070]30    private readonly ITransactionalPreprocessingData preprocessingData;
[13508]31    private readonly StatisticsLogic statisticsLogic;
32    private readonly SearchLogic searchLogic;
[10193]33
/// <summary>
/// Gets the variable names exposed by the wrapped preprocessing data.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    return preprocessingData.VariableNames;
  }
}
37
/// <summary>
/// Gets the preprocessing data instance this logic operates on
/// (the one handed to the constructor).
/// </summary>
public ITransactionalPreprocessingData PreProcessingData {
  get {
    return preprocessingData;
  }
}
41
/// <summary>
/// Creates the manipulation logic and stores the collaborators it works with.
/// </summary>
/// <param name="_prepocessingData">Data that all manipulations read from and write to.</param>
/// <param name="theSearchLogic">Used to test whether a cell holds a missing value.</param>
/// <param name="theStatisticsLogic">Used to query column/row statistics (average, median, min, max, ...).</param>
public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, SearchLogic theSearchLogic, StatisticsLogic theStatisticsLogic) {
  statisticsLogic = theStatisticsLogic;
  searchLogic = theSearchLogic;
  preprocessingData = _prepocessingData;
}
[10193]47
/// <summary>
/// Writes <paramref name="value"/> into every listed row of one column.
/// </summary>
/// <typeparam name="T">Cell type passed on to <c>SetCell</c>.</typeparam>
/// <param name="columnIndex">Column whose cells are overwritten.</param>
/// <param name="rowIndices">Rows to overwrite; enumerated once, in order.</param>
/// <param name="value">Value stored in each addressed cell.</param>
public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
  foreach (var rowIndex in rowIndices) {
    preprocessingData.SetCell<T>(columnIndex, rowIndex, value);
  }
}
[10193]53
/// <summary>
/// Replaces the given cells with the average of their column. Only columns of
/// type <c>double</c> or <c>DateTime</c> are touched; other columns are skipped.
/// All writes happen inside one <c>InTransaction</c> call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to overwrite in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics queries.</param>
public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (KeyValuePair<int, IList<int>> entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double numericAverage = statisticsLogic.GetAverage(columnIndex, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, numericAverage);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime dateAverage = statisticsLogic.GetAverageDateTime(columnIndex, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, dateAverage);
      }
    }
  });
}
[10193]67
/// <summary>
/// Replaces the given cells with the median of their column. Only columns of
/// type <c>double</c> or <c>DateTime</c> are touched; other columns are skipped.
/// All writes happen inside one <c>InTransaction</c> call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to overwrite in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics queries.</param>
public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (KeyValuePair<int, IList<int>> entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double numericMedian = statisticsLogic.GetMedian(columnIndex, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, numericMedian);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime dateMedian = statisticsLogic.GetMedianDateTime(columnIndex, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, dateMedian);
      }
    }
  });
}
[10193]81
/// <summary>
/// Replaces the given cells with random values of the form
/// <c>min + NextDouble() * (max - min)</c>, where min and max are the column's
/// extremes as reported by the statistics logic. For <c>DateTime</c> columns the
/// span is measured in seconds. Columns of any other type are skipped.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to overwrite in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the min/max queries.</param>
public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    // One generator per call, shared by all columns.
    var generator = new System.Random();

    foreach (KeyValuePair<int, IList<int>> entry in cells) {
      int columnIndex = entry.Key;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double upper = statisticsLogic.GetMax<double>(columnIndex, double.NaN, considerSelection);
        double lower = statisticsLogic.GetMin<double>(columnIndex, double.NaN, considerSelection);
        double span = upper - lower;
        foreach (int rowIndex in entry.Value) {
          double drawn = generator.NextDouble() * span + lower;
          preprocessingData.SetCell<double>(columnIndex, rowIndex, drawn);
        }
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime earliest = statisticsLogic.GetMin<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        DateTime latest = statisticsLogic.GetMax<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        double spanInSeconds = (latest - earliest).TotalSeconds;
        foreach (int rowIndex in entry.Value) {
          double drawnSeconds = generator.NextDouble() * spanInSeconds;
          preprocessingData.SetCell<DateTime>(columnIndex, rowIndex, earliest.AddSeconds(drawnSeconds));
        }
      }
    }
  });
}
[10193]107
/// <summary>
/// Fills the given cells by linear interpolation between the nearest present
/// values above and below each run of selected rows.
/// For every column, the (start, end) row pairs are determined by
/// <c>GetStartAndEndingsForInterpolation</c> and each pair is handed to
/// <c>Interpolate</c>. All writes happen inside one <c>InTransaction</c> call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to interpolate in that column.</param>
public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var column in cells) {
      // No per-column value count is needed here: the boundary search and
      // Interpolate work purely on row indices.
      foreach (var startEnd in GetStartAndEndingsForInterpolation(column)) {
        Interpolate(column, startEnd.Item1, startEnd.Item2);
      }
    }
  });
}
125
/// <summary>
/// Groups the selected rows of one column into runs of consecutive indices and,
/// for each run, returns the pair (index of the nearest present value before
/// the run, index of the nearest present value after the run).
/// Runs without a present neighbour on either side are omitted.
/// </summary>
/// <param name="column">Key: column index. Value: selected row indices (any order; not modified).</param>
/// <returns>One (start, end) tuple per run that has present neighbours on both sides.</returns>
private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
  var startEndings = new List<Tuple<int, int>>();
  List<int> rowIndices = column.Value.OrderBy(x => x).ToList();
  int count = rowIndices.Count;

  // int.MinValue marks "no run open"; a real lookup result is never that small.
  int start = int.MinValue;
  for (int i = 0; i < count; ++i) {
    if (start == int.MinValue) {
      start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
    }

    // A run ends at the last selected row or where the next selected row is not adjacent.
    bool runEndsHere = i + 1 == count || rowIndices[i + 1] - rowIndices[i] > 1;
    if (!runEndsHere) {
      continue;
    }

    int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
    // A negative start means no present value precedes the run. Row 0 is a
    // valid neighbour, hence ">= 0" (the former "> 0" dropped such runs).
    if (start >= 0 && next < preprocessingData.Rows) {
      startEndings.Add(new Tuple<int, int>(start, next));
    }
    start = int.MinValue;
  }
  return startEndings;
}
146
/// <summary>
/// For each given cell, looks up the nearest present value before and after it
/// and, if both exist, calls <c>Interpolate</c> on that span.
/// Row 0 and rows outside the data are ignored. A cell with no present value
/// on one side is left untouched.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to smooth in that column.</param>
public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var column in cells) {
      int rowCount = preprocessingData.Rows;

      foreach (int rowIndex in column.Value) {
        // the first row has no predecessor; indices beyond the data are invalid
        if (rowIndex <= 0 || rowIndex >= rowCount) {
          continue;
        }

        int before = indexOfPrevPresentValue(column.Key, rowIndex);
        int after = indexOfNextPresentValue(column.Key, rowIndex);

        // interpolate only when a present neighbour exists on both sides
        if (before >= 0 && after < rowCount) {
          Interpolate(column, before, after);
        }
      }
    }
  });
}
[10193]169
/// <summary>
/// Linearly interpolates the cells of one column between two rows.
/// Rows <paramref name="prevIndex"/> (inclusive) to <paramref name="nextIndex"/>
/// (exclusive) are written; the row at <paramref name="prevIndex"/> receives its
/// own value again. Only <c>double</c> and <c>DateTime</c> columns are handled
/// (DateTime steps are computed in seconds); other column types are left alone.
/// </summary>
/// <param name="column">Only the key (column index) is used.</param>
/// <param name="prevIndex">Row holding the value the interpolation starts from.</param>
/// <param name="nextIndex">Row holding the value the interpolation runs towards.</param>
private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
  int columnIndex = column.Key;
  int span = nextIndex - prevIndex;

  if (preprocessingData.VariableHasType<double>(columnIndex)) {
    double first = preprocessingData.GetCell<double>(columnIndex, prevIndex);
    double last = preprocessingData.GetCell<double>(columnIndex, nextIndex);
    double step = (last - first) / span;

    for (int offset = 0; offset < span; ++offset) {
      double value = first + (step * offset);
      preprocessingData.SetCell<double>(columnIndex, prevIndex + offset, value);
    }
    return;
  }

  if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
    DateTime first = preprocessingData.GetCell<DateTime>(columnIndex, prevIndex);
    DateTime last = preprocessingData.GetCell<DateTime>(columnIndex, nextIndex);
    double stepInSeconds = (last - first).TotalSeconds / span;

    for (int offset = 0; offset < span; ++offset) {
      DateTime value = first.AddSeconds(stepInSeconds * offset);
      preprocessingData.SetCell<DateTime>(columnIndex, prevIndex + offset, value);
    }
  }
}
193
/// <summary>
/// Walks upwards from the row just before <paramref name="start"/> until a row
/// is found whose cell is not a missing value.
/// </summary>
/// <param name="columnIndex">Column to inspect.</param>
/// <param name="start">Row to start from (exclusive).</param>
/// <returns>Index of the found row, or a negative number if no such row exists.</returns>
private int indexOfPrevPresentValue(int columnIndex, int start) {
  int row = start - 1;
  for (; row >= 0; row--) {
    if (!searchLogic.IsMissingValue(columnIndex, row)) {
      break;
    }
  }
  return row;
}
[10234]202
/// <summary>
/// Walks downwards from the row just after <paramref name="start"/> until a row
/// is found whose cell is not a missing value.
/// </summary>
/// <param name="columnIndex">Column to inspect.</param>
/// <param name="start">Row to start from (exclusive).</param>
/// <returns>
/// Index of the found row, or a value not smaller than
/// <c>preprocessingData.Rows</c> if no such row exists.
/// </returns>
private int indexOfNextPresentValue(int columnIndex, int start) {
  int row = start + 1;
  for (; row < preprocessingData.Rows; row++) {
    if (!searchLogic.IsMissingValue(columnIndex, row)) {
      break;
    }
  }
  return row;
}
[10234]211
/// <summary>
/// Replaces the given cells with the most common value of their column.
/// Supports <c>double</c>, <c>string</c> and <c>DateTime</c> columns.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to overwrite in that column.</param>
/// <param name="considerSelection">Forwarded unchanged to the statistics query.</param>
/// <exception cref="ArgumentException">A column in <paramref name="cells"/> has none of the supported types.</exception>
public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
  preprocessingData.InTransaction(() => {
    foreach (KeyValuePair<int, IList<int>> entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        double mostCommon = statisticsLogic.GetMostCommonValue<double>(columnIndex, double.NaN, considerSelection);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.VariableHasType<string>(columnIndex)) {
        string mostCommon = statisticsLogic.GetMostCommonValue<string>(columnIndex, string.Empty, considerSelection);
        ReplaceIndicesByValue<string>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        DateTime mostCommon = statisticsLogic.GetMostCommonValue<DateTime>(columnIndex, DateTime.MinValue, considerSelection);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
    }
  });
}
[10218]227
/// <summary>
/// Randomly reorders the rows of the data set.
/// </summary>
/// <param name="shuffleRangesSeparately">
/// If <c>true</c>, the rows of the test partition and of the training partition
/// are each shuffled only among themselves (one reorder per partition, rows
/// outside the partition stay in place). If <c>false</c>, all rows are shuffled together.
/// </param>
public void Shuffle(bool shuffleRangesSeparately) {
  var random = new FastRandom();

  if (!shuffleRangesSeparately) {
    preprocessingData.InTransaction(() => {
      var permutation = Enumerable.Range(0, preprocessingData.Rows).Shuffle(random).ToArray();
      ReOrderToIndices(permutation);
    });
    return;
  }

  var partitions = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
  preprocessingData.InTransaction(() => {
    foreach (IntRange partition in partitions) {
      // start from the identity mapping so rows outside the partition keep their place
      var targetOrder = Enumerable.Range(0, preprocessingData.Rows).ToArray();
      var shuffledPart = Enumerable.Range(partition.Start, partition.Size).Shuffle(random).ToArray();

      int source = 0;
      for (int row = partition.Start; row < partition.End; row++) {
        targetOrder[row] = shuffledPart[source];
        source++;
      }

      ReOrderToIndices(targetOrder);
    }
  });
}
253
/// <summary>
/// Reorders the rows of every <c>double</c>, <c>string</c> and <c>DateTime</c>
/// column so that row <c>i</c> afterwards holds the value that was previously
/// in row <c>indices[i]</c>. Columns of other types are not touched.
/// </summary>
/// <param name="indices">For each target row, the source row to take the value from.</param>
public void ReOrderToIndices(int[] indices) {
  preprocessingData.InTransaction(() => {
    for (int column = 0; column < preprocessingData.Columns; column++) {
      if (preprocessingData.VariableHasType<double>(column)) {
        ReOrderToIndices<double>(column, indices);
        continue;
      }
      if (preprocessingData.VariableHasType<string>(column)) {
        ReOrderToIndices<string>(column, indices);
        continue;
      }
      if (preprocessingData.VariableHasType<DateTime>(column)) {
        ReOrderToIndices<DateTime>(column, indices);
      }
    }
  });
}
[10218]267
/// <summary>
/// Reorders one column: row <c>i</c> receives the value that row
/// <c>indices[i]</c> held before the call. The column is copied first so that
/// already overwritten rows do not influence later reads.
/// </summary>
/// <typeparam name="T">Cell type of the column.</typeparam>
/// <param name="columnIndex">Column to reorder.</param>
/// <param name="indices">Source row for every target row; must have one entry per value in the column.</param>
/// <exception cref="InvalidOperationException">The length of <paramref name="indices"/> differs from the number of values in the column.</exception>
private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
  List<T> snapshot = preprocessingData.GetValues<T>(columnIndex).ToList();
  if (snapshot.Count != indices.Length) throw new InvalidOperationException("The number of provided indices does not match the values.");

  for (int targetRow = 0; targetRow < indices.Length; targetRow++) {
    T moved = snapshot[indices[targetRow]];
    preprocessingData.SetCell<T>(columnIndex, targetRow, moved);
  }
}
[10672]280
/// <summary>
/// Stores the textual <paramref name="value"/> in every given cell via
/// <c>SetValue</c>, inside one <c>InTransaction</c> call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to overwrite in that column.</param>
/// <param name="value">Text handed to <c>SetValue</c> for each cell.</param>
public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
  preprocessingData.InTransaction(() => {
    foreach (KeyValuePair<int, IList<int>> entry in cells) {
      int columnIndex = entry.Key;
      foreach (int rowIndex in entry.Value) {
        preprocessingData.SetValue(value, columnIndex, rowIndex);
      }
    }
  });
}
[10711]290
291
/// <summary>
/// Finds the rows whose share of missing values exceeds a threshold.
/// </summary>
/// <param name="percent">Threshold in percent (0-100); a row qualifies only if its share is strictly greater.</param>
/// <returns>Indices of the qualifying rows, in ascending order.</returns>
public List<int> RowsWithMissingValuesGreater(double percent) {
  List<int> rows = new List<int>();

  for (int i = 0; i < preprocessingData.Rows; ++i) {
    int missingCount = statisticsLogic.GetRowMissingValueCount(i);
    // Multiply before dividing and stay in double precision: the threshold is
    // a double, and single-precision rounding could flip the comparison for
    // shares that sit exactly on it.
    double missingPercent = 100.0 * missingCount / preprocessingData.Columns;
    if (missingPercent > percent) {
      rows.Add(i);
    }
  }

  return rows;
}
305
/// <summary>
/// Finds the columns whose share of missing values exceeds a threshold.
/// </summary>
/// <param name="percent">Threshold in percent (0-100); a column qualifies only if its share is strictly greater.</param>
/// <returns>Indices of the qualifying columns, in ascending order.</returns>
public List<int> ColumnsWithMissingValuesGreater(double percent) {
  List<int> columns = new List<int>();

  for (int i = 0; i < preprocessingData.Columns; ++i) {
    int missingCount = statisticsLogic.GetMissingValueCount(i);
    // Multiply before dividing and stay in double precision: the threshold is
    // a double, and single-precision rounding could flip the comparison for
    // shares that sit exactly on it.
    double missingPercent = 100.0 * missingCount / preprocessingData.Rows;
    if (missingPercent > percent) {
      columns.Add(i);
    }
  }

  return columns;
}
318
/// <summary>
/// Finds the <c>double</c> and <c>DateTime</c> columns whose variance, as
/// reported by the statistics logic, is strictly below a threshold.
/// Columns of other types are never returned.
/// </summary>
/// <param name="variance">Exclusive upper bound for the column variance.</param>
/// <returns>Indices of the qualifying columns, in ascending order.</returns>
public List<int> ColumnsWithVarianceSmaller(double variance) {
  return Enumerable.Range(0, preprocessingData.Columns)
    .Where(i => preprocessingData.VariableHasType<double>(i) || preprocessingData.VariableHasType<DateTime>(i))
    .Where(i => statisticsLogic.GetVariance(i) < variance)
    .ToList();
}
332
/// <summary>
/// Deletes every row reported by <c>RowsWithMissingValuesGreater</c> for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed through unchanged.</param>
public void DeleteRowsWithMissingValuesGreater(double percent) {
  List<int> affectedRows = RowsWithMissingValuesGreater(percent);
  DeleteRows(affectedRows);
}
336
/// <summary>
/// Deletes every column reported by <c>ColumnsWithMissingValuesGreater</c> for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed through unchanged.</param>
public void DeleteColumnsWithMissingValuesGreater(double percent) {
  List<int> affectedColumns = ColumnsWithMissingValuesGreater(percent);
  DeleteColumns(affectedColumns);
}
340
/// <summary>
/// Deletes every column reported by <c>ColumnsWithVarianceSmaller</c> for the given threshold.
/// </summary>
/// <param name="variance">Variance threshold, passed through unchanged.</param>
public void DeleteColumnsWithVarianceSmaller(double variance) {
  List<int> affectedColumns = ColumnsWithVarianceSmaller(variance);
  DeleteColumns(affectedColumns);
}
344
/// <summary>
/// Deletes the given rows inside one <c>InTransaction</c> call.
/// The list is sorted in place into descending order first, so that removing
/// a row never shifts the index of a row that is still to be removed.
/// </summary>
/// <param name="rows">Row indices to delete; reordered by this method.</param>
private void DeleteRows(List<int> rows) {
  // highest index first
  rows.Sort((left, right) => right.CompareTo(left));
  preprocessingData.InTransaction(() => {
    foreach (int rowIndex in rows) {
      preprocessingData.DeleteRow(rowIndex);
    }
  });
}
354
/// <summary>
/// Deletes the given columns inside one <c>InTransaction</c> call.
/// The list is sorted in place into descending order first, so that removing
/// a column never shifts the index of a column that is still to be removed.
/// </summary>
/// <param name="columns">Column indices to delete; reordered by this method.</param>
private void DeleteColumns(List<int> columns) {
  // highest index first
  columns.Sort((left, right) => right.CompareTo(left));
  preprocessingData.InTransaction(() => {
    foreach (int columnIndex in columns) {
      preprocessingData.DeleteColumn(columnIndex);
    }
  });
}
[10737]364
[10249]365  }
[10193]366}
Note: See TracBrowser for help on using the repository browser.