Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataPreprocessing/3.4/Logic/ManipulationLogic.cs @ 14525

Last change on this file since 14525 was 14185, checked in by swagner, 9 years ago

#2526: Updated year of copyrights in license headers

File size: 16.2 KB
RevLine 
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
[10193]23using System.Collections.Generic;
24using System.Linq;
[10249]25using HeuristicLab.Data;
[10193]26
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Bulk manipulation operations (replacing, interpolating, shuffling, reordering and
  /// deleting cells, rows and columns) on an <see cref="ITransactionalPreprocessingData"/>.
  /// Operations that touch several cells are wrapped in a single call to
  /// <c>preprocessingData.InTransaction</c>.
  /// </summary>
  public class ManipulationLogic {
    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly StatisticsLogic statisticsLogic;
    private readonly SearchLogic searchLogic;

    /// <summary>Variable names of the underlying preprocessing data.</summary>
    public IEnumerable<string> VariableNames {
      get { return preprocessingData.VariableNames; }
    }

    /// <summary>The preprocessing data this logic operates on.</summary>
    public ITransactionalPreprocessingData PreProcessingData {
      get { return preprocessingData; }
    }

    /// <summary>
    /// Creates a manipulation logic working on the given data.
    /// </summary>
    /// <param name="_prepocessingData">Data whose cells are read and modified.</param>
    /// <param name="theSearchLogic">Used to decide whether a cell holds a missing value.</param>
    /// <param name="theStatisticsLogic">Used to obtain averages, medians, extrema, variances and missing-value counts.</param>
    public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, SearchLogic theSearchLogic, StatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticsLogic = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of one column.
    /// Not wrapped in a transaction by itself.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    /// <param name="value">Replacement value.</param>
    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
      foreach (int index in rowIndices) {
        preprocessingData.SetCell<T>(columnIndex, index, value);
      }
    }

    /// <summary>
    /// Replaces the given cells by the average of their column. Only double and
    /// DateTime columns are handled; columns of any other type are skipped.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
    public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double average = statisticsLogic.GetAverage(column.Key, considerSelection);
            ReplaceIndicesByValue<double>(column.Key, column.Value, average);
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
          }
        }
      });
    }

    /// <summary>
    /// Replaces the given cells by the median of their column. Only double and
    /// DateTime columns are handled; columns of any other type are skipped.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double median = statisticsLogic.GetMedian(column.Key, considerSelection);
            ReplaceIndicesByValue<double>(column.Key, column.Value, median);
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
          }
        }
      });
    }

    /// <summary>
    /// Replaces each given cell by an individually drawn random value between the
    /// minimum and maximum reported for its column. Only double and DateTime columns
    /// are handled; columns of any other type are skipped.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        Random r = new Random();

        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            double max = statisticsLogic.GetMax<double>(column.Key, double.NaN, considerSelection);
            double min = statisticsLogic.GetMin<double>(column.Key, double.NaN, considerSelection);
            double randMultiplier = (max - min);
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier + min;
              preprocessingData.SetCell<double>(column.Key, index, rand);
            }
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, DateTime.MinValue, considerSelection);
            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, DateTime.MinValue, considerSelection);
            // the random offset is drawn in seconds and added to the minimum
            double randMultiplier = (max - min).TotalSeconds;
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier;
              preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
            }
          }
        }
      });
    }

    /// <summary>
    /// Replaces the given cells by a linear interpolation between the nearest present
    /// (non-missing) values before and after each contiguous run of given rows.
    /// Runs that have no present value on one of their sides are left untouched.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column);
          foreach (var tuple in startEndings) {
            Interpolate(column, tuple.Item1, tuple.Item2);
          }
        }
      });
    }

    /// <summary>
    /// Groups the row indices of a column into contiguous runs and returns, for each run,
    /// the index of the nearest present value before it (Item1) and after it (Item2).
    /// Runs lacking a present value on either side are omitted.
    /// </summary>
    private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
      List<Tuple<int, int>> startEndings = new List<Tuple<int, int>>();
      IList<int> rowIndices = column.Value.OrderBy(x => x).ToList();
      int count = rowIndices.Count;
      // int.MinValue marks "start of the current run not determined yet"
      int start = int.MinValue;
      for (int i = 0; i < count; ++i) {
        if (start == int.MinValue) {
          start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
        }
        // the run ends at the last index or when the next index is not adjacent
        if (i + 1 == count || rowIndices[i + 1] - rowIndices[i] > 1) {
          int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
          // indexOfPrevPresentValue yields -1 when nothing was found, so row 0 is a valid start
          if (start >= 0 && next < preprocessingData.Rows) {
            startEndings.Add(new Tuple<int, int>(start, next));
          }
          start = int.MinValue;
        }
      }
      return startEndings;
    }

    /// <summary>
    /// For every given cell, interpolates linearly between the nearest present values
    /// before and after it. Cells without a present neighbour on both sides (including
    /// the cell in row 0) are skipped.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to smooth in that column.</param>
    public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          int countValues = preprocessingData.Rows;

          foreach (int index in column.Value) {
            // don't replace the first value; the last row is rejected below
            // because no following neighbour can be found for it
            if (index > 0 && index < countValues) {
              int prevIndex = indexOfPrevPresentValue(column.Key, index);
              int nextIndex = indexOfNextPresentValue(column.Key, index);

              // no neighbours found
              if (prevIndex < 0 || nextIndex >= countValues) {
                continue;
              }

              Interpolate(column, prevIndex, nextIndex);
            }
          }
        }
      });
    }

    /// <summary>
    /// Overwrites every cell strictly between <paramref name="prevIndex"/> and
    /// <paramref name="nextIndex"/> with a value on the straight line connecting the two
    /// anchor cells. Handles double and DateTime columns (DateTime in steps of seconds);
    /// other column types are ignored.
    /// </summary>
    private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
      int valuesToInterpolate = nextIndex - prevIndex;

      if (preprocessingData.VariableHasType<double>(column.Key)) {
        double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
        double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // start after prevIndex: the anchor cell already holds its own value
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(column.Key, i, interpolated);
        }
      } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
        DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
        DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
        double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;

        // start after prevIndex: the anchor cell already holds its own value
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Returns the index of the closest row before <paramref name="start"/> whose cell is
    /// not reported as missing, or -1 if there is none.
    /// </summary>
    private int indexOfPrevPresentValue(int columnIndex, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Returns the index of the closest row after <paramref name="start"/> whose cell is
    /// not reported as missing, or the row count if there is none.
    /// </summary>
    private int indexOfNextPresentValue(int columnIndex, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Replaces the given cells by the most common value of their column.
    /// Supports double, string and DateTime columns.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Forwarded unchanged to the statistics logic.</param>
    /// <exception cref="ArgumentException">A column is of none of the supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.VariableHasType<double>(column.Key)) {
            ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, double.NaN, considerSelection));
          } else if (preprocessingData.VariableHasType<string>(column.Key)) {
            ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, string.Empty, considerSelection));
          } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, DateTime.MinValue, considerSelection));
          } else {
            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
          }
        }
      });
    }

    /// <summary>
    /// Randomly shuffles the rows covered by the test and training partitions.
    /// </summary>
    /// <param name="shuffleRangesSeparately">
    /// If true, each partition is shuffled within its own bounds; otherwise the row
    /// indices of both partitions are pooled and shuffled together.
    /// </param>
    public void Shuffle(bool shuffleRangesSeparately) {
      Random random = new Random();
      var ranges = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
      if (shuffleRangesSeparately) {
        preprocessingData.InTransaction(() => {
          // process all given ranges - e.g. TrainingPartition, TestPartition
          foreach (IntRange range in ranges) {
            List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

            // generate the swap partners used for shuffling each column (Fisher-Yates);
            // Random.Next excludes its upper bound, hence i + 1 so that a row may stay in place
            for (int i = range.End - 1; i >= range.Start; --i) {
              int rand = random.Next(range.Start, i + 1);
              shuffledIndices.Add(new Tuple<int, int>(i, rand));
            }

            ShuffleToIndices(shuffledIndices);
          }
        });
      } else {
        preprocessingData.InTransaction(() => {
          var indices = ranges.SelectMany(x => Enumerable.Range(x.Start, x.Size)).ToList();
          var shuffledIndices = indices.OrderBy(x => random.Next());
          ShuffleToIndices(indices.Zip(shuffledIndices, (i, j) => new Tuple<int, int>(i, j)).ToList());
        });
      }
    }

    /// <summary>
    /// Reorders all rows so that row <c>i</c> receives the values of the row whose index
    /// is the <c>i</c>-th element of <paramref name="indices"/>.
    /// </summary>
    /// <param name="indices">Source row index for each target row, in target order.</param>
    public void ReOrderToIndices(IEnumerable<int> indices) {
      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

      // single pass: avoids re-enumerating the sequence for every element
      int targetIndex = 0;
      foreach (int sourceIndex in indices) {
        indicesTuple.Add(new Tuple<int, int>(targetIndex, sourceIndex));
        targetIndex++;
      }

      ReOrderToIndices(indicesTuple);
    }

    /// <summary>
    /// Reorders all double, string and DateTime columns: for each tuple, the row Item1
    /// receives the value the row Item2 held before the reordering started.
    /// </summary>
    /// <param name="indices">Pairs of (target row, source row).</param>
    public void ReOrderToIndices(IList<System.Tuple<int, int>> indices) {
      preprocessingData.InTransaction(() => {
        for (int i = 0; i < preprocessingData.Columns; ++i) {
          if (preprocessingData.VariableHasType<double>(i)) {
            reOrderToIndices<double>(i, indices);
          } else if (preprocessingData.VariableHasType<string>(i)) {
            reOrderToIndices<string>(i, indices);
          } else if (preprocessingData.VariableHasType<DateTime>(i)) {
            reOrderToIndices<DateTime>(i, indices);
          }
        }
      });
    }

    /// <summary>
    /// Applies the given sequence of row swaps to all double, string and DateTime columns:
    /// for each tuple, the values in rows Item1 and Item2 are exchanged, in list order.
    /// </summary>
    /// <param name="indices">Pairs of row indices to swap.</param>
    public void ShuffleToIndices(IList<System.Tuple<int, int>> indices) {
      preprocessingData.InTransaction(() => {
        for (int i = 0; i < preprocessingData.Columns; ++i) {
          if (preprocessingData.VariableHasType<double>(i)) {
            ShuffleToIndices<double>(i, indices);
          } else if (preprocessingData.VariableHasType<string>(i)) {
            ShuffleToIndices<string>(i, indices);
          } else if (preprocessingData.VariableHasType<DateTime>(i)) {
            ShuffleToIndices<DateTime>(i, indices);
          }
        }
      });
    }

    /// <summary>
    /// Reorders one column. Values are read from a snapshot taken before the first write,
    /// so earlier assignments do not affect later ones.
    /// </summary>
    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));

      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T replaceValue = originalData[replaceIndex];
        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
      }
    }

    /// <summary>
    /// Swaps, within one column, the cells of each index pair in list order. Unlike
    /// <c>reOrderToIndices</c> this works on the live data, so each swap sees the
    /// result of the previous ones.
    /// </summary>
    private void ShuffleToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T tmp = preprocessingData.GetCell<T>(columnIndex, originalIndex);
        T replaceValue = preprocessingData.GetCell<T>(columnIndex, replaceIndex);

        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
        preprocessingData.SetCell<T>(columnIndex, replaceIndex, tmp);
      }
    }

    /// <summary>
    /// Sets every given cell to the supplied string value via
    /// <c>preprocessingData.SetValue</c>.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="value">Replacement value in its string form.</param>
    public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          foreach (var rowIdx in column.Value) {
            preprocessingData.SetValue(value, column.Key, rowIdx);
          }
        }
      });
    }

    /// <summary>
    /// Returns the indices of all rows whose share of missing values (in percent of the
    /// column count) is strictly greater than <paramref name="percent"/>.
    /// </summary>
    /// <param name="percent">Threshold in percent (0-100).</param>
    public List<int> RowsWithMissingValuesGreater(double percent) {
      List<int> rows = new List<int>();

      for (int i = 0; i < preprocessingData.Rows; ++i) {
        int missingCount = statisticsLogic.GetRowMissingValueCount(i);
        // computed in double so no single-precision rounding error enters the comparison
        if (100.0 * missingCount / preprocessingData.Columns > percent) {
          rows.Add(i);
        }
      }

      return rows;
    }

    /// <summary>
    /// Returns the indices of all columns whose share of missing values (in percent of
    /// the row count) is strictly greater than <paramref name="percent"/>.
    /// </summary>
    /// <param name="percent">Threshold in percent (0-100).</param>
    public List<int> ColumnsWithMissingValuesGreater(double percent) {
      List<int> columns = new List<int>();

      for (int i = 0; i < preprocessingData.Columns; ++i) {
        int missingCount = statisticsLogic.GetMissingValueCount(i);
        // computed in double so no single-precision rounding error enters the comparison
        if (100.0 * missingCount / preprocessingData.Rows > percent) {
          columns.Add(i);
        }
      }

      return columns;
    }

    /// <summary>
    /// Returns the indices of all double and DateTime columns whose variance is strictly
    /// smaller than <paramref name="variance"/>. Columns of other types are never returned.
    /// </summary>
    /// <param name="variance">Variance threshold.</param>
    public List<int> ColumnsWithVarianceSmaller(double variance) {
      List<int> columns = new List<int>();

      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i) || preprocessingData.VariableHasType<DateTime>(i)) {
          double columnVariance = statisticsLogic.GetVariance(i);
          if (columnVariance < variance) {
            columns.Add(i);
          }
        }
      }

      return columns;
    }

    /// <summary>Deletes all rows reported by <see cref="RowsWithMissingValuesGreater"/>.</summary>
    public void DeleteRowsWithMissingValuesGreater(double percent) {
      DeleteRows(RowsWithMissingValuesGreater(percent));
    }

    /// <summary>Deletes all columns reported by <see cref="ColumnsWithMissingValuesGreater"/>.</summary>
    public void DeleteColumnsWithMissingValuesGreater(double percent) {
      DeleteColumns(ColumnsWithMissingValuesGreater(percent));
    }

    /// <summary>Deletes all columns reported by <see cref="ColumnsWithVarianceSmaller"/>.</summary>
    public void DeleteColumnsWithVarianceSmaller(double variance) {
      DeleteColumns(ColumnsWithVarianceSmaller(variance));
    }

    /// <summary>
    /// Deletes the given rows in one transaction. The list is sorted in place into
    /// descending order first, presumably so that a deletion does not shift the
    /// indices still waiting to be deleted.
    /// </summary>
    private void DeleteRows(List<int> rows) {
      rows.Sort();
      rows.Reverse();
      preprocessingData.InTransaction(() => {
        foreach (int row in rows) {
          preprocessingData.DeleteRow(row);
        }
      });
    }

    /// <summary>
    /// Deletes the given columns in one transaction. The list is sorted in place into
    /// descending order first, presumably so that a deletion does not shift the
    /// indices still waiting to be deleted.
    /// </summary>
    private void DeleteColumns(List<int> columns) {
      columns.Sort();
      columns.Reverse();
      preprocessingData.InTransaction(() => {
        foreach (int column in columns) {
          preprocessingData.DeleteColumn(column);
        }
      });
    }

  }
}
Note: See TracBrowser for help on using the repository browser.