Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10367

Last change on this file since 10367 was 10367, checked in by rstoll, 10 years ago
  • modified PreprocessingData to use columnIndex instead of variableName (faster and more convenient); marked the variableName-based methods as Obsolete
  • Already changed SearchLogic, DataGridLogic, StatisticLogic as well as PreprocessingDataManipulation

*

File size: 6.2 KB
RevLine 
[10249]1using System;
[10193]2using System.Collections.Generic;
3using System.Linq;
[10249]4using HeuristicLab.Data;
[10193]5
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Bulk cell-replacement and row-reordering operations on an
  /// <see cref="IPreprocessingData"/> instance. Statistics are obtained from an
  /// <see cref="IStatisticsLogic"/> and missing-value checks from an
  /// <see cref="ISearchLogic"/>.
  /// </summary>
  public class PreprocessingDataManipulation : IPreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticInfo;
    private readonly ISearchLogic searchLogic;

    // One generator per instance: creating a new Random for every call can hand
    // out identical sequences when calls happen within the same clock tick.
    private readonly Random random = new Random();

    /// <summary>
    /// Creates a manipulator working on the given data.
    /// </summary>
    /// <param name="_prepocessingData">The data whose cells are read and written.</param>
    /// <param name="theSearchLogic">Used to decide whether a cell is a missing value.</param>
    /// <param name="theStatisticsLogic">Used to obtain average, median, min, max and most common value of a column.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticInfo = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of one column.
    /// </summary>
    /// <typeparam name="T">Cell type passed through to <c>SetCell</c>.</typeparam>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows whose cell is overwritten.</param>
    /// <param name="value">The replacement value.</param>
    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
      foreach (int index in rowIndices) {
        preprocessingData.SetCell<T>(columnIndex, index, value);
      }
    }

    /// <summary>
    /// Replaces the given rows of a column with the column average reported by the statistics logic.
    /// </summary>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows whose cell is overwritten.</param>
    public void ReplaceIndicesByAverageValue(int columnIndex, IEnumerable<int> rowIndices) {
      double average = statisticInfo.GetAverage(columnIndex);
      ReplaceIndicesByValue<double>(columnIndex, rowIndices, average);
    }

    /// <summary>
    /// Replaces the given rows of a column with the column median reported by the statistics logic.
    /// </summary>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows whose cell is overwritten.</param>
    public void ReplaceIndicesByMedianValue(int columnIndex, IEnumerable<int> rowIndices) {
      double median = statisticInfo.GetMedian(columnIndex);
      ReplaceIndicesByValue<double>(columnIndex, rowIndices, median);
    }

    /// <summary>
    /// Replaces each given row of a column with an independently drawn random value
    /// between the column minimum (inclusive) and maximum (exclusive).
    /// </summary>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows whose cell is overwritten.</param>
    public void ReplaceIndicesByRandomValue(int columnIndex, IEnumerable<int> rowIndices) {
      // min and max are read once, before any cell is changed
      double max = statisticInfo.GetMax<double>(columnIndex);
      double min = statisticInfo.GetMin<double>(columnIndex);
      double randMultiplier = (max - min);
      foreach (int index in rowIndices) {
        double rand = random.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(columnIndex, index, rand);
      }
    }

    /// <summary>
    /// Replaces each given row, together with all missing rows in the same gap, by
    /// values on the straight line between the nearest present value before and the
    /// nearest present value after it.
    /// </summary>
    /// <remarks>
    /// Rows are skipped when they are the first or last row of the column, lie
    /// outside the column, or have no present value on one of their two sides.
    /// </remarks>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows to interpolate.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(int columnIndex, IEnumerable<int> rowIndices) {
      int countValues = preprocessingData.GetValues<double>(columnIndex).Count();
      foreach (int index in rowIndices) {
        // the first and the last row have only one neighbour and are never replaced
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(columnIndex, index);
        int nextIndex = indexOfNextPresentValue(columnIndex, index);

        // interpolation needs a present value on BOTH sides; otherwise one of the
        // indices is out of range (-1 or past the end) and must not be read
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(columnIndex, prevIndex);
        double next = preprocessingData.GetCell<double>(columnIndex, nextIndex);

        // slope of the line through (prevIndex, prev) and (nextIndex, next)
        double interpolationStep = (next - prev) / (nextIndex - prevIndex);

        // fill only the rows strictly between the two present neighbours
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(columnIndex, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Searches backwards from the row before <paramref name="start"/> for a row that is not a missing value.
    /// </summary>
    /// <returns>The index of that row, or -1 if every preceding row is missing.</returns>
    private int indexOfPrevPresentValue(int columnIndex, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Searches forwards from the row after <paramref name="start"/> for a row that is not a missing value.
    /// </summary>
    /// <returns>The index of that row, or <c>preprocessingData.Rows</c> if every following row is missing.</returns>
    private int indexOfNextPresentValue(int columnIndex, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Replaces the given rows of a column with the column's most common value.
    /// Supported column types are <c>double</c>, <c>string</c> and <c>DateTime</c>.
    /// </summary>
    /// <param name="columnIndex">Index of the column to modify.</param>
    /// <param name="rowIndices">Rows whose cell is overwritten.</param>
    /// <exception cref="ArgumentException">The column has none of the supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(int columnIndex, IEnumerable<int> rowIndices) {
      if (preprocessingData.IsType<double>(columnIndex)) {
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<double>(columnIndex));
      } else if (preprocessingData.IsType<string>(columnIndex)) {
        ReplaceIndicesByValue<string>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<string>(columnIndex));
      } else if (preprocessingData.IsType<DateTime>(columnIndex)) {
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<DateTime>(columnIndex));
      } else {
        throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Randomly permutes the rows inside each given range; rows are only moved
    /// within their own range and every column receives the same permutation.
    /// </summary>
    /// <remarks>
    /// NOTE(review): both <c>range.Start</c> and <c>range.End</c> are treated as
    /// row indices belonging to the range, as the previous implementation did -
    /// confirm against the definition of IntRange.
    /// </remarks>
    /// <param name="ranges">The ranges to shuffle, e.g. the data partitions.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      foreach (IntRange range in ranges) {
        int count = range.End - range.Start + 1;
        if (count < 2) {
          // nothing to permute
          continue;
        }

        // sourceRows[k] is the row whose value ends up in row (range.Start + k)
        int[] sourceRows = new int[count];
        for (int k = 0; k < count; ++k) {
          sourceRows[k] = range.Start + k;
        }

        // Fisher-Yates shuffle; the result is a true permutation, so no row value
        // is duplicated or lost when the rows are copied from the snapshot taken
        // by reOrderToIndices
        for (int k = count - 1; k > 0; --k) {
          int pick = random.Next(k + 1); // upper bound is exclusive, so pick is in [0, k]
          int tmp = sourceRows[k];
          sourceRows[k] = sourceRows[pick];
          sourceRows[pick] = tmp;
        }

        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>(count);
        for (int k = 0; k < count; ++k) {
          shuffledIndices.Add(new Tuple<int, int>(range.Start + k, sourceRows[k]));
        }

        reOrderToIndices(shuffledIndices);
      }
    }

    /// <summary>
    /// Reorders all rows so that row <c>i</c> receives the value of the row given
    /// by the <c>i</c>-th element of <paramref name="indices"/>.
    /// </summary>
    /// <param name="indices">Source row for each target row, in target order.</param>
    /// <exception cref="ArgumentNullException"><paramref name="indices"/> is null.</exception>
    public void reOrderToIndices(IEnumerable<int> indices) {
      if (indices == null) {
        throw new ArgumentNullException("indices");
      }

      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

      // single pass; avoids re-enumerating the sequence for every element
      int targetRow = 0;
      foreach (int sourceRow in indices) {
        indicesTuple.Add(new Tuple<int, int>(targetRow, sourceRow));
        ++targetRow;
      }

      reOrderToIndices(indicesTuple);
    }

    /// <summary>
    /// Applies the given (target row, source row) assignments to every column of
    /// type <c>double</c>, <c>string</c> or <c>DateTime</c>. Columns of any other
    /// type are left untouched.
    /// </summary>
    /// <param name="indices">Pairs where Item1 is the row written and Item2 the row read.</param>
    public void reOrderToIndices(IList<System.Tuple<int, int>> indices) {
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.IsType<double>(i)) {
          reOrderToIndices<double>(i, indices);
        } else if (preprocessingData.IsType<string>(i)) {
          reOrderToIndices<string>(i, indices);
        } else if (preprocessingData.IsType<DateTime>(i)) {
          reOrderToIndices<DateTime>(i, indices);
        }
      }
    }

    /// <summary>
    /// Applies the (target row, source row) assignments to a single column.
    /// </summary>
    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      // snapshot of the column: every assignment reads the value the source row
      // had before any cell of this column was overwritten
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));

      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T replaceValue = originalData[replaceIndex];
        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.