Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/ManipulationLogic.cs @ 10785

Last change on this file since 10785 was 10737, checked in by rstoll, 11 years ago
  • Preview and execution for delete columns/rows with insufficient information
File size: 14.2 KB
RevLine 
[10539]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10193]23using System.Collections.Generic;
24using System.Linq;
[10249]25using HeuristicLab.Data;
[10193]26
[10249]27namespace HeuristicLab.DataPreprocessing {
[10369]28  public class ManipulationLogic : IManipulationLogic {
// Collaborators handed in through the constructor. They are never reassigned
// afterwards, hence readonly.
private readonly ITransactionalPreprocessingData preprocessingData;
private readonly IStatisticsLogic statisticsLogic;
private readonly ISearchLogic searchLogic;
private readonly IDataGridLogic dataGridLogic;

/// <summary>
/// Creates the manipulation logic and stores the collaborators it works with.
/// </summary>
/// <param name="_prepocessingData">Data that is read and modified by the manipulation methods.</param>
/// <param name="theSearchLogic">Used to ask whether a cell holds a missing value.</param>
/// <param name="theStatisticsLogic">Source of the statistics (average, median, min, max, ...) used as replacement values.</param>
/// <param name="theDataGridLogic">Used to set string values and to expose the Changed event.</param>
public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic, IDataGridLogic theDataGridLogic) {
  preprocessingData = _prepocessingData;
  searchLogic = theSearchLogic;
  statisticsLogic = theStatisticsLogic;
  dataGridLogic = theDataGridLogic;
}
[10193]40
/// <summary>
/// Writes <paramref name="value"/> into every listed row of a single column.
/// </summary>
/// <param name="columnIndex">Column whose cells are overwritten.</param>
/// <param name="rowIndices">Rows of that column to overwrite.</param>
/// <param name="value">Value stored in each of the addressed cells.</param>
public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
  foreach (var rowIndex in rowIndices) {
    preprocessingData.SetCell<T>(columnIndex, rowIndex, value);
  }
}
[10193]46
/// <summary>
/// Replaces the given cells with the average of their column.
/// Only double and DateTime columns are handled; columns of any other type are skipped.
/// All replacements are issued from within one preprocessingData.InTransaction call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.IsType<double>(columnIndex)) {
        double columnAverage = statisticsLogic.GetAverage(columnIndex);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, columnAverage);
        continue;
      }

      if (preprocessingData.IsType<DateTime>(columnIndex)) {
        DateTime columnAverage = statisticsLogic.GetAverageDateTime(columnIndex);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, columnAverage);
      }
    }
  });
}
[10193]60
/// <summary>
/// Replaces the given cells with the median of their column.
/// Only double and DateTime columns are handled; columns of any other type are skipped.
/// All replacements are issued from within one preprocessingData.InTransaction call.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.IsType<double>(columnIndex)) {
        double columnMedian = statisticsLogic.GetMedian(columnIndex);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, columnMedian);
        continue;
      }

      if (preprocessingData.IsType<DateTime>(columnIndex)) {
        DateTime columnMedian = statisticsLogic.GetMedianDateTime(columnIndex);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, columnMedian);
      }
    }
  });
}
[10193]74
/// <summary>
/// Replaces the given cells with random values drawn between the minimum and the
/// maximum of their column. Only double and DateTime columns are handled; columns
/// of any other type are skipped. DateTime values are drawn with second resolution
/// as an offset from the column minimum.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    // one generator shared by all columns and cells of this call
    var generator = new Random();

    foreach (var entry in cells) {
      int columnIndex = entry.Key;

      if (preprocessingData.IsType<double>(columnIndex)) {
        double upper = statisticsLogic.GetMax<double>(columnIndex);
        double lower = statisticsLogic.GetMin<double>(columnIndex);
        double span = upper - lower;

        foreach (int rowIndex in entry.Value) {
          double drawn = generator.NextDouble() * span + lower;
          preprocessingData.SetCell<double>(columnIndex, rowIndex, drawn);
        }
      } else if (preprocessingData.IsType<DateTime>(columnIndex)) {
        DateTime earliest = statisticsLogic.GetMin<DateTime>(columnIndex);
        DateTime latest = statisticsLogic.GetMax<DateTime>(columnIndex);
        double spanInSeconds = (latest - earliest).TotalSeconds;

        foreach (int rowIndex in entry.Value) {
          double offsetInSeconds = generator.NextDouble() * spanInSeconds;
          preprocessingData.SetCell<DateTime>(columnIndex, rowIndex, earliest.AddSeconds(offsetInSeconds));
        }
      }
    }
  });
}
[10193]100
/// <summary>
/// Replaces the given cells by linear interpolation between the nearest present
/// (non-missing) value before and after them in the same column.
/// Only double and DateTime columns are handled. A cell is left untouched when it
/// is the first or last value of the column, or when a present neighbour is
/// missing on either side.
/// </summary>
/// <remarks>
/// All cells strictly between the two present neighbours are filled in one go,
/// not only the requested one.
/// </remarks>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var column in cells) {
      bool isDouble = preprocessingData.IsType<double>(column.Key);
      bool isDateTime = !isDouble && preprocessingData.IsType<DateTime>(column.Key);

      // stays 0 for unsupported column types, which makes the loop below a no-op
      int countValues = 0;
      if (isDouble) {
        countValues = preprocessingData.GetValues<double>(column.Key).Count();
      } else if (isDateTime) {
        countValues = preprocessingData.GetValues<DateTime>(column.Key).Count();
      }

      foreach (int index in column.Value) {
        // the first and the last value have a neighbour on one side only
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(column.Key, index);
        int nextIndex = indexOfNextPresentValue(column.Key, index);

        // interpolation needs a present value on BOTH sides; skip if either search ran off the column
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        int valuesToInterpolate = nextIndex - prevIndex;

        if (isDouble) {
          double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
          double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
          double interpolationStep = (next - prev) / valuesToInterpolate;

          // start behind prevIndex: the present neighbour itself must not be rewritten
          for (int i = prevIndex + 1; i < nextIndex; ++i) {
            double interpolated = prev + (interpolationStep * (i - prevIndex));
            preprocessingData.SetCell<double>(column.Key, i, interpolated);
          }
        } else if (isDateTime) {
          DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
          DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
          double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;

          // start behind prevIndex: the present neighbour itself must not be rewritten
          for (int i = prevIndex + 1; i < nextIndex; ++i) {
            DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
            preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
          }
        }
      }
    }
  });
}
[10193]148
/// <summary>
/// Walks backwards from the row before <paramref name="start"/> until a cell is
/// found that searchLogic does not report as missing.
/// </summary>
/// <returns>The index of that cell, or a negative index if the walk passed row 0.</returns>
private int indexOfPrevPresentValue(int columnIndex, int start) {
  int candidate = start - 1;
  for (; candidate >= 0; --candidate) {
    if (!searchLogic.IsMissingValue(columnIndex, candidate)) {
      break;
    }
  }

  return candidate;
}
[10234]157
/// <summary>
/// Walks forwards from the row after <paramref name="start"/> until a cell is
/// found that searchLogic does not report as missing.
/// </summary>
/// <returns>
/// The index of that cell, or an index of at least preprocessingData.Rows if the
/// walk reached the end of the column.
/// </returns>
private int indexOfNextPresentValue(int columnIndex, int start) {
  int candidate = start + 1;
  for (; candidate < preprocessingData.Rows; ++candidate) {
    if (!searchLogic.IsMissingValue(columnIndex, candidate)) {
      break;
    }
  }

  return candidate;
}
[10234]166
/// <summary>
/// Replaces the given cells with the most common value of their column.
/// Supports double, string and DateTime columns.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
/// <exception cref="ArgumentException">A referenced column has none of the supported types.</exception>
public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      IList<int> rowIndices = entry.Value;

      if (preprocessingData.IsType<double>(columnIndex)) {
        double mostCommon = statisticsLogic.GetMostCommonValue<double>(columnIndex);
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.IsType<string>(columnIndex)) {
        string mostCommon = statisticsLogic.GetMostCommonValue<string>(columnIndex);
        ReplaceIndicesByValue<string>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      if (preprocessingData.IsType<DateTime>(columnIndex)) {
        DateTime mostCommon = statisticsLogic.GetMostCommonValue<DateTime>(columnIndex);
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, mostCommon);
        continue;
      }

      throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
    }
  });
}
[10218]182
/// <summary>
/// Shuffles the rows of the test partition and of the training partition,
/// each one within its own range.
/// </summary>
public void ShuffleWithRanges() {
  var partitions = new[] {
    preprocessingData.TestPartition,
    preprocessingData.TrainingPartition
  };

  ShuffleWithRanges(partitions);
}
189
/// <summary>
/// Shuffles the rows inside each of the given ranges. Rows are only exchanged
/// with rows of the same range, and every column receives the same swaps.
/// </summary>
/// <remarks>
/// range.End is treated as exclusive: the last row taken into account is range.End - 1.
/// </remarks>
/// <param name="ranges">Row ranges to shuffle, e.g. the training and the test partition.</param>
public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
  // init random outside loop
  Random random = new Random();

  preprocessingData.InTransaction(() => {
    // process all given ranges - e.g. TrainingPartition, TestPartition
    foreach (IntRange range in ranges) {
      List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

      // Fisher-Yates: pair every position i with a random position in [range.Start, i].
      // Random.Next excludes its upper bound, so i + 1 is required for i itself to be
      // eligible; without it no row could ever keep its place and the shuffle is biased.
      // Position range.Start needs no partner - it could only be swapped with itself.
      for (int i = range.End - 1; i > range.Start; --i) {
        int rand = random.Next(range.Start, i + 1);
        shuffledIndices.Add(new Tuple<int, int>(i, rand));
      }

      ShuffleToIndices(shuffledIndices);
    }
  });
}
209
/// <summary>
/// Reorders all rows so that row i receives the values of the row whose index is
/// the i-th element of <paramref name="indices"/>.
/// </summary>
/// <param name="indices">Source row index for each target row, in target order.</param>
/// <exception cref="ArgumentNullException"><paramref name="indices"/> is null.</exception>
public void ReOrderToIndices(IEnumerable<int> indices) {
  if (indices == null) {
    throw new ArgumentNullException("indices");
  }

  List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

  // single pass: Count()/ElementAt(i) per iteration would re-enumerate the sequence every time
  int targetIndex = 0;
  foreach (int sourceIndex in indices) {
    indicesTuple.Add(new Tuple<int, int>(targetIndex, sourceIndex));
    ++targetIndex;
  }

  ReOrderToIndices(indicesTuple);
}
219
/// <summary>
/// Applies the given reordering to every double, string and DateTime column.
/// Columns of any other type are skipped.
/// </summary>
/// <param name="indices">Pairs of (target row, source row).</param>
public void ReOrderToIndices(IList<System.Tuple<int, int>> indices) {
  preprocessingData.InTransaction(() => {
    for (int columnIndex = 0; columnIndex < preprocessingData.Columns; ++columnIndex) {
      if (preprocessingData.IsType<double>(columnIndex)) {
        reOrderToIndices<double>(columnIndex, indices);
        continue;
      }

      if (preprocessingData.IsType<string>(columnIndex)) {
        reOrderToIndices<string>(columnIndex, indices);
        continue;
      }

      if (preprocessingData.IsType<DateTime>(columnIndex)) {
        reOrderToIndices<DateTime>(columnIndex, indices);
      }
    }
  });
}
[10218]233
/// <summary>
/// Applies the given row swaps to every double, string and DateTime column.
/// Columns of any other type are skipped.
/// </summary>
/// <param name="indices">Pairs of row indices whose values are exchanged, in the given order.</param>
public void ShuffleToIndices(IList<System.Tuple<int, int>> indices) {
  preprocessingData.InTransaction(() => {
    for (int columnIndex = 0; columnIndex < preprocessingData.Columns; ++columnIndex) {
      if (preprocessingData.IsType<double>(columnIndex)) {
        ShuffleToIndices<double>(columnIndex, indices);
        continue;
      }

      if (preprocessingData.IsType<string>(columnIndex)) {
        ShuffleToIndices<string>(columnIndex, indices);
        continue;
      }

      if (preprocessingData.IsType<DateTime>(columnIndex)) {
        ShuffleToIndices<DateTime>(columnIndex, indices);
      }
    }
  });
}
255
/// <summary>
/// Reorders one column: for every pair, the target row (Item1) receives the value
/// the source row (Item2) had before the reordering started.
/// </summary>
/// <param name="columnIndex">Column to reorder.</param>
/// <param name="indices">Pairs of (target row, source row).</param>
private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
  // copy the column first so that values are read from the state before any cell was overwritten
  List<T> snapshot = preprocessingData.GetValues<T>(columnIndex).ToList();

  foreach (var pair in indices) {
    int targetRow = pair.Item1;
    int sourceRow = pair.Item2;

    preprocessingData.SetCell<T>(columnIndex, targetRow, snapshot[sourceRow]);
  }
}
[10672]269
/// <summary>
/// Exchanges the values of the two rows of every pair within one column.
/// The swaps are applied one after another in list order, so a later swap sees
/// the result of the earlier ones.
/// </summary>
/// <param name="columnIndex">Column in which the values are exchanged.</param>
/// <param name="indices">Pairs of row indices to swap.</param>
private void ShuffleToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
  foreach (var pair in indices) {
    int firstRow = pair.Item1;
    int secondRow = pair.Item2;

    T firstValue = preprocessingData.GetCell<T>(columnIndex, firstRow);
    T secondValue = preprocessingData.GetCell<T>(columnIndex, secondRow);

    preprocessingData.SetCell<T>(columnIndex, firstRow, secondValue);
    preprocessingData.SetCell<T>(columnIndex, secondRow, firstValue);
  }
}
285
/// <summary>
/// Sets every given cell to <paramref name="value"/> by passing the string to
/// dataGridLogic.SetValue.
/// </summary>
/// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
/// <param name="value">Textual value handed to the data grid logic for each cell.</param>
public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
  preprocessingData.InTransaction(() => {
    foreach (var entry in cells) {
      int columnIndex = entry.Key;
      foreach (int rowIndex in entry.Value) {
        dataGridLogic.SetValue(value, columnIndex, rowIndex);
      }
    }
  });
}
[10711]295
296
/// <summary>
/// Collects the rows whose share of missing values exceeds the given percentage.
/// </summary>
/// <param name="percent">Threshold in percent (0-100) of the number of columns; the comparison is strict.</param>
/// <returns>Indices of the matching rows in ascending order.</returns>
public List<int> RowsWithMissingValuesGreater(double percent) {
  List<int> rows = new List<int>();

  for (int i = 0; i < preprocessingData.Rows; ++i) {
    int missingCount = statisticsLogic.GetRowMissingValueCount(i);

    // computed in double: single-precision arithmetic can misjudge shares close to the threshold
    double missingPercent = 100.0 * missingCount / preprocessingData.Columns;
    if (missingPercent > percent) {
      rows.Add(i);
    }
  }

  return rows;
}
312
/// <summary>
/// Collects the columns whose share of missing values exceeds the given percentage.
/// </summary>
/// <param name="percent">Threshold in percent (0-100) of the number of rows; the comparison is strict.</param>
/// <returns>Indices of the matching columns in ascending order.</returns>
public List<int> ColumnsWithMissingValuesGreater(double percent) {
  List<int> columns = new List<int>();

  for (int i = 0; i < preprocessingData.Columns; ++i) {
    int missingCount = statisticsLogic.GetMissingValueCount(i);

    // computed in double: single-precision arithmetic can misjudge shares close to the threshold
    double missingPercent = 100.0 * missingCount / preprocessingData.Rows;
    if (missingPercent > percent) {
      columns.Add(i);
    }
  }

  return columns;
}
325
/// <summary>
/// Collects the double and DateTime columns whose variance, as reported by
/// statisticsLogic.GetVariance, is strictly below the given value.
/// Columns of any other type are never reported.
/// </summary>
/// <param name="variance">Exclusive upper bound for the column variance.</param>
/// <returns>Indices of the matching columns in ascending order.</returns>
public List<int> ColumnsWithVarianceSmaller(double variance) {
  var matches = new List<int>();

  for (int columnIndex = 0; columnIndex < preprocessingData.Columns; ++columnIndex) {
    if (!preprocessingData.IsType<double>(columnIndex) && !preprocessingData.IsType<DateTime>(columnIndex)) {
      continue;
    }

    double columnVariance = statisticsLogic.GetVariance(columnIndex);
    if (columnVariance < variance) {
      matches.Add(columnIndex);
    }
  }

  return matches;
}
341
/// <summary>
/// Deletes every row reported by RowsWithMissingValuesGreater for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed on unchanged.</param>
public void DeleteRowsWithMissingValuesGreater(double percent) {
  List<int> rowsToDelete = RowsWithMissingValuesGreater(percent);
  DeleteRows(rowsToDelete);
}
345
/// <summary>
/// Deletes every column reported by ColumnsWithMissingValuesGreater for the given threshold.
/// </summary>
/// <param name="percent">Threshold in percent, passed on unchanged.</param>
public void DeleteColumnsWithMissingValuesGreater(double percent) {
  List<int> columnsToDelete = ColumnsWithMissingValuesGreater(percent);
  DeleteColumns(columnsToDelete);
}
349
/// <summary>
/// Deletes every column reported by ColumnsWithVarianceSmaller for the given bound.
/// </summary>
/// <param name="variance">Variance bound, passed on unchanged.</param>
public void DeleteColumnsWithVarianceSmaller(double variance) {
  List<int> columnsToDelete = ColumnsWithVarianceSmaller(variance);
  DeleteColumns(columnsToDelete);
}
353
/// <summary>
/// Deletes the given rows, highest index first, so that the indices still to be
/// processed are not shifted by earlier deletions.
/// </summary>
/// <param name="rows">Row indices to delete; the list is sorted in place (descending).</param>
private void DeleteRows(List<int> rows) {
  // descending order in one step instead of Sort() followed by Reverse()
  rows.Sort((left, right) => right.CompareTo(left));

  preprocessingData.InTransaction(() => {
    foreach (int rowIndex in rows) {
      preprocessingData.DeleteRow(rowIndex);
    }
  });
}
365
/// <summary>
/// Deletes the given columns, highest index first, so that the indices still to
/// be processed are not shifted by earlier deletions.
/// </summary>
/// <param name="columns">Column indices to delete; the list is sorted in place (descending).</param>
private void DeleteColumns(List<int> columns) {
  // descending order in one step instead of Sort() followed by Reverse()
  columns.Sort((left, right) => right.CompareTo(left));

  preprocessingData.InTransaction(() => {
    foreach (int columnIndex in columns) {
      preprocessingData.DeleteColumn(columnIndex);
    }
  });
}
[10737]377
/// <summary>
/// Forwards subscriptions to the Changed event of the data grid logic; this class
/// keeps no handler list of its own.
/// </summary>
public event DataPreprocessingChangedEventHandler Changed {
  add {
    dataGridLogic.Changed += value;
  }
  remove {
    dataGridLogic.Changed -= value;
  }
}
[10249]382  }
[10193]383}
Note: See TracBrowser for help on using the repository browser.