Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/ManipulationLogic.cs @ 10615

Last change on this file since 10615 was 10615, checked in by sbreuer, 10 years ago
  • allow replace value with ... functions for date time
File size: 10.3 KB
RevLine 
[10539]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[10193]23using System.Collections.Generic;
24using System.Linq;
[10249]25using HeuristicLab.Data;
[10193]26
[10249]27namespace HeuristicLab.DataPreprocessing {
[10369]28  public class ManipulationLogic : IManipulationLogic {
[10586]29    private ITransactionalPreprocessingData preprocessingData;
[10615]30    private IStatisticsLogic statisticsLogic;
[10249]31    private ISearchLogic searchLogic;
[10193]32
[10586]33    public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
[10249]34      preprocessingData = _prepocessingData;
35      searchLogic = theSearchLogic;
[10615]36      statisticsLogic = theStatisticsLogic;
[10249]37    }
[10193]38
[10367]39    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
40      foreach (int index in rowIndices) {
41        preprocessingData.SetCell<T>(columnIndex, index, value);
[10249]42      }
43    }
[10193]44
[10590]45    public void ReplaceIndicesByAverageValue(Dictionary<int, List<int>> cells) {
[10612]46      preprocessingData.InTransaction(() => {
47        foreach (var column in cells) {
[10615]48          if (preprocessingData.IsType<double>(column.Key)) {
49            double average = statisticsLogic.GetAverage(column.Key);
50            ReplaceIndicesByValue<double>(column.Key, column.Value, average);
51          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
52            DateTime average = statisticsLogic.GetAverageDateTime(column.Key);
53            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
54          }
[10612]55        }
56      });
[10249]57    }
[10193]58
[10590]59    public void ReplaceIndicesByMedianValue(Dictionary<int, List<int>> cells) {
[10612]60      preprocessingData.InTransaction(() => {
61        foreach (var column in cells) {
[10615]62          if (preprocessingData.IsType<double>(column.Key)) {
63            double median = statisticsLogic.GetMedian(column.Key);
64            ReplaceIndicesByValue<double>(column.Key, column.Value, median);
65          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
66            DateTime median = statisticsLogic.GetMedianDateTime(column.Key);
67            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
68          }
[10612]69        }
70      });
[10249]71    }
[10193]72
[10590]73    public void ReplaceIndicesByRandomValue(Dictionary<int, List<int>> cells) {
[10612]74      preprocessingData.InTransaction(() => {
75        Random r = new Random();
[10193]76
[10612]77        foreach (var column in cells) {
[10615]78          if (preprocessingData.IsType<double>(column.Key)) {
79            double max = statisticsLogic.GetMax<double>(column.Key);
80            double min = statisticsLogic.GetMin<double>(column.Key);
81            double randMultiplier = (max - min);
82            foreach (int index in column.Value) {
83              double rand = r.NextDouble() * randMultiplier + min;
84              preprocessingData.SetCell<double>(column.Key, index, rand);
85            }
86          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
87            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key);
88            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key);
89            double randMultiplier = (max - min).TotalSeconds;
90            foreach (int index in column.Value) {
91              double rand = r.NextDouble() * randMultiplier;
92              preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
93            }
[10612]94          }
[10590]95        }
[10612]96      });
[10249]97    }
[10193]98
[10590]99    public void ReplaceIndicesByLinearInterpolationOfNeighbours(Dictionary<int, List<int>> cells) {
[10612]100      preprocessingData.InTransaction(() => {
101        foreach (var column in cells) {
[10615]102          if (preprocessingData.IsType<double>(column.Key)) {
103            int countValues = preprocessingData.GetValues<double>(column.Key).Count();
104            foreach (int index in column.Value) {
105              // dont replace first or last values
106              if (index > 0 && index < countValues) {
107                int prevIndex = indexOfPrevPresentValue(column.Key, index);
108                int nextIndex = indexOfNextPresentValue(column.Key, index);
[10193]109
[10615]110                // no neighbours found
111                if (prevIndex < 0 && nextIndex >= countValues) {
112                  continue;
113                }
114                double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
115                double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
116
117                int valuesToInterpolate = nextIndex - prevIndex;
118
119                double interpolationStep = (next - prev) / valuesToInterpolate;
120
121                for (int i = prevIndex; i < nextIndex; ++i) {
122                  double interpolated = prev + (interpolationStep * (i - prevIndex));
123                  preprocessingData.SetCell<double>(column.Key, i, interpolated);
124                }
[10612]125              }
[10615]126            }
127          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
128            int countValues = preprocessingData.GetValues<DateTime>(column.Key).Count();
129            foreach (int index in column.Value) {
130              // dont replace first or last values
131              if (index > 0 && index < countValues) {
132                int prevIndex = indexOfPrevPresentValue(column.Key, index);
133                int nextIndex = indexOfNextPresentValue(column.Key, index);
[10193]134
[10615]135                // no neighbours found
136                if (prevIndex < 0 && nextIndex >= countValues) {
137                  continue;
138                }
139                DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
140                DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
[10234]141
[10615]142                int valuesToInterpolate = nextIndex - prevIndex;
[10234]143
[10615]144                double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
145
146                for (int i = prevIndex; i < nextIndex; ++i) {
147                  DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
148                  preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
149                }
[10612]150              }
[10590]151            }
[10249]152          }
[10193]153        }
[10612]154      });
[10249]155    }
[10193]156
[10367]157    private int indexOfPrevPresentValue(int columnIndex, int start) {
[10249]158      int offset = start - 1;
[10367]159      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
[10249]160        offset--;
161      }
[10234]162
[10249]163      return offset;
164    }
[10234]165
[10367]166    private int indexOfNextPresentValue(int columnIndex, int start) {
[10249]167      int offset = start + 1;
[10367]168      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
[10249]169        offset++;
170      }
[10234]171
[10249]172      return offset;
173    }
[10234]174
[10590]175    public void ReplaceIndicesByMostCommonValue(Dictionary<int, List<int>> cells) {
[10612]176      preprocessingData.InTransaction(() => {
177        foreach (var column in cells) {
178          if (preprocessingData.IsType<double>(column.Key)) {
[10615]179            ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key));
[10612]180          } else if (preprocessingData.IsType<string>(column.Key)) {
[10615]181            ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key));
[10612]182          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
[10615]183            ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key));
[10612]184          } else {
185            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
186          }
[10590]187        }
[10612]188      });
[10249]189    }
[10218]190
[10249]191    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
192      // init random outside loop
193      Random random = new Random();
[10218]194
[10612]195      preprocessingData.InTransaction(() => {
196        // process all given ranges - e.g. TrainingPartition, Trainingpartition
197        foreach (IntRange range in ranges) {
198          List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();
[10218]199
[10612]200          // generate random indices used for shuffeling each column
201          for (int i = range.End; i > range.Start; --i) {
202            int rand = random.Next(range.Start, i);
203            shuffledIndices.Add(new Tuple<int, int>(i, rand));
204          }
205
206          ReOrderToIndices(shuffledIndices);
[10218]207        }
[10612]208      });
[10253]209    }
210
[10535]211    public void ReOrderToIndices(IEnumerable<int> indices) {
[10256]212      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();
[10255]213
[10256]214      for (int i = 0; i < indices.Count(); ++i) {
[10311]215        indicesTuple.Add(new Tuple<int, int>(i, indices.ElementAt(i)));
[10256]216      }
217
[10535]218      ReOrderToIndices(indicesTuple);
[10255]219    }
220
[10535]221    public void ReOrderToIndices(IList<System.Tuple<int, int>> indices) {
[10612]222      preprocessingData.InTransaction(() => {
223        for (int i = 0; i < preprocessingData.Columns; ++i) {
224          if (preprocessingData.IsType<double>(i)) {
225            reOrderToIndices<double>(i, indices);
226          } else if (preprocessingData.IsType<string>(i)) {
227            reOrderToIndices<string>(i, indices);
228          } else if (preprocessingData.IsType<DateTime>(i)) {
229            reOrderToIndices<DateTime>(i, indices);
230          }
[10249]231        }
[10612]232      });
[10249]233    }
[10218]234
[10367]235    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
[10308]236
[10367]237      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));
[10308]238
[10249]239      // process all columns equally
240      foreach (Tuple<int, int> index in indices) {
241        int originalIndex = index.Item1;
242        int replaceIndex = index.Item2;
[10218]243
[10308]244        T replaceValue = originalData.ElementAt<T>(replaceIndex);
[10367]245        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
[10249]246      }
[10193]247    }
[10249]248  }
[10193]249}
Note: See TracBrowser for help on using the repository browser.