Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10252

Last change on this file since 10252 was 10249, checked in by rstoll, 11 years ago
  • Renamed StatisticInfo to StatisticsLogic
  • Fixed todo in PreprocessingDataManipulation
File size: 5.9 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using HeuristicLab.Data;
5
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Replaces, interpolates and shuffles cell values of the wrapped <see cref="IPreprocessingData"/>.
  /// All modifications are written directly through <c>IPreprocessingData.SetCell</c>.
  /// </summary>
  class PreprocessingDataManipulation : IPreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticInfo;
    private readonly ISearchLogic searchLogic;

    /// <summary>
    /// Creates a manipulation object operating on the given data.
    /// </summary>
    /// <param name="_prepocessingData">Data whose cells are read and overwritten.</param>
    /// <param name="theSearchLogic">Used to decide whether a cell is a missing value.</param>
    /// <param name="theStatisticsLogic">Source of average, median, min, max and most common value.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticInfo = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of the column <paramref name="variableName"/>.
    /// </summary>
    /// <typeparam name="T">Type of the cell values.</typeparam>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    /// <param name="value">Value stored in each of the cells.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        preprocessingData.SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>
    /// Overwrites the given rows of a column with the column average reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>
    /// Overwrites the given rows of a column with the column median reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>
    /// Overwrites each given row of a column with its own random value drawn from [min, max),
    /// where min and max are the column extremes reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      Random r = new Random();

      // min and max are read once, before any cell is modified
      double max = statisticInfo.GetMax<double>(variableName);
      double min = statisticInfo.GetMin<double>(variableName);
      double randMultiplier = (max - min);
      foreach (int index in indices) {
        double rand = r.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// Replaces the given rows of a column by a linear interpolation between the closest
    /// non-missing value before and the closest non-missing value after the row.
    /// </summary>
    /// <remarks>
    /// The first and the last row are never replaced. A row is skipped as well if there is no
    /// non-missing value on one of its sides. All cells strictly between the two neighbours are
    /// written, i.e. the whole gap around the row is filled, not only the row itself.
    /// </remarks>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to interpolate.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = preprocessingData.GetValues<double>(variableName).Count();
      foreach (int index in indices) {
        // dont replace first or last values (this also rejects indices outside of the column)
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(variableName, index);
        int nextIndex = indexOfNextPresentValue(variableName, index);

        // a neighbour is missing on at least one side, so there is nothing to interpolate between
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
        double next = preprocessingData.GetCell<double>(variableName, nextIndex);

        // prevIndex < index < nextIndex, hence the divisor is at least 2
        int valuesToInterpolate = nextIndex - prevIndex;
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // start behind prevIndex: the present neighbour itself must not be rewritten
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(variableName, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Searches backwards from the row before <paramref name="start"/> for a non-missing value.
    /// </summary>
    /// <returns>Index of the closest preceding non-missing value, or -1 if there is none.</returns>
    private int indexOfPrevPresentValue(string variableName, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Searches forwards from the row after <paramref name="start"/> for a non-missing value.
    /// </summary>
    /// <returns>
    /// Index of the closest following non-missing value, or a value greater than or equal to
    /// <c>preprocessingData.Rows</c> if there is none.
    /// </returns>
    private int indexOfNextPresentValue(string variableName, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Overwrites the given rows of a column with the most common value of that column.
    /// Supported column types are double, string and DateTime.
    /// </summary>
    /// <param name="variableName">Name of the column to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    /// <exception cref="ArgumentException">The column has none of the supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      if (preprocessingData.IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (preprocessingData.IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (preprocessingData.IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with name: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Shuffles the rows inside each of the given ranges. The same swaps are applied to every
    /// column of type double, string or DateTime, so that the rows stay intact.
    /// </summary>
    /// <remarks>
    /// NOTE(review): columns of any other type are skipped silently and would therefore get out
    /// of sync with the shuffled columns - confirm that no other column types can occur.
    /// </remarks>
    /// <param name="ranges">Row ranges that are shuffled independently of each other.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // init random outside loop
      Random random = new Random();

      // process all given ranges - e.g. the training and the test partition
      foreach (IntRange range in ranges) {
        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

        // generate the swaps once per range so that every column is reordered identically
        // NOTE(review): range.End is used as a valid row index here; if IntRange.End is
        // exclusive the loop has to start at range.End - 1 - confirm against IntRange.
        for (int i = range.End; i > range.Start; --i) {
          // Random.Next excludes its upper bound; i + 1 allows a row to stay in place,
          // which is required for an unbiased (Fisher-Yates) shuffle
          int rand = random.Next(range.Start, i + 1);
          shuffledIndices.Add(new Tuple<int, int>(i, rand));
        }

        foreach (string variableName in preprocessingData.VariableNames) {
          if (preprocessingData.IsType<double>(variableName)) {
            reOrderToIndices<double>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<string>(variableName)) {
            reOrderToIndices<string>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<DateTime>(variableName)) {
            reOrderToIndices<DateTime>(variableName, shuffledIndices);
          }
        }
      }
    }

    /// <summary>
    /// Applies the given swaps, in list order, to one column.
    /// </summary>
    /// <typeparam name="T">Type of the cell values of the column.</typeparam>
    /// <param name="variableName">Name of the column to reorder.</param>
    /// <param name="indices">Pairs of row indices whose cell values are exchanged.</param>
    public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        // swapping a cell with itself changes nothing
        if (originalIndex == replaceIndex) {
          continue;
        }

        T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
        T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

        preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
        preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.