source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10311

Last change on this file since 10311 was 10311, checked in by rstoll, 6 years ago
  • bug fixed: tuples for sorting were not added to the tuples list
  • changed the standard view for DataGridContentView
  • changed SetValues to take an IList (was an IEnumerable before)
File size: 6.3 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using HeuristicLab.Data;
5using System.Collections;
6
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Manipulates the cells of an <see cref="IPreprocessingData"/> instance: replaces selected
  /// cells of a variable by computed values and reorders or shuffles the rows of all variables.
  /// </summary>
  public class PreprocessingDataManipulation : IPreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticInfo;
    private readonly ISearchLogic searchLogic;

    /// <summary>
    /// Creates a manipulation helper working on the given data.
    /// </summary>
    /// <param name="_prepocessingData">The data whose cells are read and written.</param>
    /// <param name="theSearchLogic">Used to decide whether a cell is a missing value.</param>
    /// <param name="theStatisticsLogic">Used to obtain average, median, minimum, maximum and most common value.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticInfo = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every cell of the variable listed in <paramref name="indices"/>.
    /// </summary>
    /// <typeparam name="T">Type of the cells of the variable.</typeparam>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    /// <param name="value">The replacement value.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        preprocessingData.SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>
    /// Replaces the given cells by the average reported by the statistics logic.
    /// The average is determined once, before any cell is changed.
    /// </summary>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>
    /// Replaces the given cells by the median reported by the statistics logic.
    /// The median is determined once, before any cell is changed.
    /// </summary>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>
    /// Replaces each given cell by its own random value drawn from [min, max), where min and max
    /// are the minimum and maximum reported by the statistics logic before any cell is changed.
    /// </summary>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      Random r = new Random();

      double max = statisticInfo.GetMax<double>(variableName);
      double min = statisticInfo.GetMin<double>(variableName);
      double randMultiplier = (max - min);
      foreach (int index in indices) {
        // NextDouble() is in [0, 1), so the result lies in [min, max)
        double rand = r.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// Replaces the given cells by a linear interpolation between the nearest present
    /// (non-missing) values before and after each cell.
    /// </summary>
    /// <remarks>
    /// The first and the last row are never replaced, and a cell is skipped when there is no
    /// present value on one of its sides. All cells strictly between the two present neighbours
    /// are written, not only the one listed in <paramref name="indices"/>.
    /// </remarks>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to interpolate.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = preprocessingData.GetValues<double>(variableName).Count();
      foreach (int index in indices) {
        // dont replace first or last values - they have a neighbour on one side only
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(variableName, index);
        int nextIndex = indexOfNextPresentValue(variableName, index);

        // interpolation needs a present value on BOTH sides; if either search ran off the
        // data, reading that neighbour would access a cell that does not exist
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
        double next = preprocessingData.GetCell<double>(variableName, nextIndex);

        // number of steps from prev to next; at least 2 because prevIndex < index < nextIndex
        int valuesToInterpolate = nextIndex - prevIndex;

        // slope of the line through (prevIndex, prev) and (nextIndex, next)
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // only the cells strictly between the two present neighbours are written
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(variableName, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Searches backwards from <paramref name="start"/> (exclusive) for a cell that is not missing.
    /// </summary>
    /// <returns>The index of that cell, or -1 if every cell before <paramref name="start"/> is missing.</returns>
    private int indexOfPrevPresentValue(string variableName, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Searches forwards from <paramref name="start"/> (exclusive) for a cell that is not missing.
    /// </summary>
    /// <returns>The index of that cell, or the number of rows if every cell after <paramref name="start"/> is missing.</returns>
    private int indexOfNextPresentValue(string variableName, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Replaces the given cells by the most common value reported by the statistics logic.
    /// Supports variables of type double, string and DateTime.
    /// </summary>
    /// <param name="variableName">Name of the variable to modify.</param>
    /// <param name="indices">Row indices of the cells to overwrite.</param>
    /// <exception cref="ArgumentException">The variable has none of the supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      if (preprocessingData.IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (preprocessingData.IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (preprocessingData.IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Randomly permutes the rows inside each given range; rows are never moved from one range
    /// into another by this method, and all variables are shuffled with the same permutation.
    /// </summary>
    /// <param name="ranges">The row ranges to shuffle, e.g. the training partition.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // init random outside loop
      Random random = new Random();

      foreach (IntRange range in ranges) {
        int firstRow = range.Start;
        // NOTE(review): range.End is treated as an inclusive row index, as the previous loop
        // did; it is clamped to the last existing row so an End equal to the row count cannot
        // address a cell outside the data. Confirm the convention of IntRange.End.
        int lastRow = Math.Min(range.End, preprocessingData.Rows - 1);
        int length = lastRow - firstRow + 1;
        if (length < 2) {
          continue; // nothing to shuffle
        }

        // sourceRows[k] is the row whose values end up in row (firstRow + k)
        int[] sourceRows = new int[length];
        for (int k = 0; k < length; ++k) {
          sourceRows[k] = firstRow + k;
        }

        // Fisher-Yates shuffle: yields a true permutation, so no row is duplicated or lost
        // when the values are copied from the pre-shuffle snapshot
        for (int i = length - 1; i > 0; --i) {
          int j = random.Next(i + 1);
          int tmp = sourceRows[i];
          sourceRows[i] = sourceRows[j];
          sourceRows[j] = tmp;
        }

        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>(length);
        for (int k = 0; k < length; ++k) {
          shuffledIndices.Add(new Tuple<int, int>(firstRow + k, sourceRows[k]));
        }

        reOrderToIndices(shuffledIndices);
      }
    }

    /// <summary>
    /// Reorders the rows so that row i receives the values of the row given by the i-th element
    /// of <paramref name="indices"/>.
    /// </summary>
    /// <param name="indices">For each target row (in order, starting at 0) the source row index.</param>
    public void reOrderToIndices(IEnumerable<int> indices) {
      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

      // single pass; avoids re-enumerating the sequence via Count()/ElementAt() per element
      int targetIndex = 0;
      foreach (int sourceIndex in indices) {
        indicesTuple.Add(new Tuple<int, int>(targetIndex, sourceIndex));
        targetIndex++;
      }

      reOrderToIndices(indicesTuple);
    }

    /// <summary>
    /// Applies the given (target row, source row) pairs to every variable of type double, string
    /// or DateTime. Variables of any other type are left untouched.
    /// </summary>
    /// <param name="indices">Pairs where Item1 is the row to write and Item2 the row to read from.</param>
    public void reOrderToIndices(IList<System.Tuple<int, int>> indices) {
      foreach (string variableName in preprocessingData.VariableNames) {
        if (preprocessingData.IsType<double>(variableName)) {
          reOrderToIndices<double>(variableName, indices);
        } else if (preprocessingData.IsType<string>(variableName)) {
          reOrderToIndices<string>(variableName, indices);
        } else if (preprocessingData.IsType<DateTime>(variableName)) {
          reOrderToIndices<DateTime>(variableName, indices);
        }
      }
    }

    /// <summary>
    /// Applies the given (target row, source row) pairs to a single variable.
    /// </summary>
    /// <typeparam name="T">Type of the cells of the variable.</typeparam>
    /// <param name="variableName">Name of the variable to reorder.</param>
    /// <param name="indices">Pairs where Item1 is the row to write and Item2 the row to read from.</param>
    private void reOrderToIndices<T>(string variableName, IList<Tuple<int, int>> indices) {
      // snapshot of the column, so every source value is read as it was before the reordering
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(variableName));

      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T replaceValue = originalData[replaceIndex];
        preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.