Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10236

Last change on this file since 10236 was 10236, checked in by sbreuer, 10 years ago
  • moved missing value functionality to ISearchLogic
  • created IStringConvertibleMatrix implementation and DataGridContentView
File size: 7.0 KB
Line 
1using HeuristicLab.Data;
2using System;
3using System.Collections.Generic;
4using System.Linq;
5using System.Text;
6
namespace HeuristicLab.DataPreprocessing.Implementations
{
    /// <summary>
    /// Bulk manipulation operations on an <see cref="IPreprocessingData"/> instance:
    /// replacing selected cells of a column by a constant, a statistic, a random
    /// value or an interpolated value, and shuffling rows inside given ranges.
    /// </summary>
    class PreprocessingDataManipulation
    {
        private readonly IPreprocessingData preprocessingData;
        private readonly StatisticInfo statisticInfo;

        // One generator for the whole instance: time-seeded Random objects created in
        // quick succession can produce identical sequences on older runtimes.
        private readonly Random random = new Random();

        /// <summary>
        /// Creates a manipulator that reads from and writes to the given data.
        /// </summary>
        /// <param name="_prepocessingData">The data to manipulate; must not be null.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="_prepocessingData"/> is null.
        /// </exception>
        public PreprocessingDataManipulation(IPreprocessingData _prepocessingData)
        {
            if (_prepocessingData == null)
            {
                throw new ArgumentNullException("_prepocessingData");
            }

            preprocessingData = _prepocessingData;
            //todo
            statisticInfo = new StatisticInfo(preprocessingData, new SearchLogic(preprocessingData));
        }

        /// <summary>
        /// Writes <paramref name="value"/> into every listed row of the column.
        /// </summary>
        /// <typeparam name="T">Cell type passed through to <c>SetCell</c>.</typeparam>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose cells are overwritten.</param>
        /// <param name="value">The replacement value.</param>
        public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value)
        {
            foreach (int index in indices)
            {
                preprocessingData.SetCell<T>(variableName, index, value);
            }
        }

        /// <summary>
        /// Overwrites the listed rows with the column average reported by the statistics helper.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose cells are overwritten.</param>
        public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices)
        {
            double average = statisticInfo.GetAverage(variableName);
            ReplaceIndicesByValue<double>(variableName, indices, average);
        }

        /// <summary>
        /// Overwrites the listed rows with the column median reported by the statistics helper.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose cells are overwritten.</param>
        public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices)
        {
            double median = statisticInfo.GetMedian(variableName);
            ReplaceIndicesByValue<double>(variableName, indices, median);
        }

        /// <summary>
        /// Overwrites each listed row with an independently drawn random value that lies
        /// between the column minimum (inclusive) and maximum (exclusive).
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose cells are overwritten.</param>
        public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices)
        {
            // Min and max are read once, before any cell is changed.
            double max = statisticInfo.GetMax<double>(variableName);
            double min = statisticInfo.GetMin<double>(variableName);
            double randMultiplier = max - min;

            foreach (int index in indices)
            {
                double rand = random.NextDouble() * randMultiplier + min;
                preprocessingData.SetCell<double>(variableName, index, rand);
            }
        }

        /// <summary>
        /// Fills the gap around each listed row by linear interpolation between the nearest
        /// present (non-missing) values before and after that row.
        /// </summary>
        /// <remarks>
        /// The first and the last row are never replaced. A row is also left untouched when
        /// no present value exists on one of its sides. Every row strictly between the two
        /// neighbours is written, not only the listed one.
        /// </remarks>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose surrounding gaps are interpolated.</param>
        public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices)
        {
            int countValues = preprocessingData.GetValues<double>(variableName).Count();
            foreach (int index in indices)
            {
                // The first and last rows have a neighbour on one side only.
                if (index <= 0 || index >= countValues - 1)
                {
                    continue;
                }

                int prevIndex = indexOfPrevPresentValue(variableName, index);
                int nextIndex = indexOfNextPresentValue(variableName, index);

                // Interpolation needs a present value on BOTH sides; otherwise one of the
                // indices is outside the column and must not be read.
                if (prevIndex < 0 || nextIndex >= countValues)
                {
                    continue;
                }

                double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
                double next = preprocessingData.GetCell<double>(variableName, nextIndex);

                // Slope of the straight line from (prevIndex, prev) to (nextIndex, next).
                int valuesToInterpolate = nextIndex - prevIndex;
                double interpolationStep = (next - prev) / valuesToInterpolate;

                // Start after prevIndex: the neighbour itself already holds the right value.
                for (int i = prevIndex + 1; i < nextIndex; ++i)
                {
                    double interpolated = prev + (interpolationStep * (i - prevIndex));
                    preprocessingData.SetCell<double>(variableName, i, interpolated);
                }
            }
        }

        /// <summary>
        /// Searches backwards from the row before <paramref name="start"/> for a row that
        /// is not flagged as missing.
        /// </summary>
        /// <returns>The index of that row, or -1 when every earlier row is missing.</returns>
        private int indexOfPrevPresentValue(string variableName, int start)
        {
            int offset = start - 1;
            while (offset >= 0 && preprocessingData.IsMissingValue(variableName, offset))
            {
                offset--;
            }

            return offset;
        }

        /// <summary>
        /// Searches forwards from the row after <paramref name="start"/> for a row that
        /// is not flagged as missing.
        /// </summary>
        /// <returns>
        /// The index of that row, or the row count of the data when every later row is missing.
        /// </returns>
        private int indexOfNextPresentValue(string variableName, int start)
        {
            int offset = start + 1;
            while (offset < preprocessingData.Rows && preprocessingData.IsMissingValue(variableName, offset))
            {
                offset++;
            }

            return offset;
        }

        /// <summary>
        /// Overwrites the listed rows with the most common value of the column, for columns
        /// of type double, string or DateTime.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices whose cells are overwritten.</param>
        /// <exception cref="ArgumentException">The column has any other type.</exception>
        public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices)
        {
            if (preprocessingData.IsType<double>(variableName))
            {
                ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
            }
            else if (preprocessingData.IsType<string>(variableName))
            {
                ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
            }
            else if (preprocessingData.IsType<DateTime>(variableName))
            {
                ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
            }
            else
            {
                throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
            }
        }

        /// <summary>
        /// Shuffles the rows inside each given range. The same swap sequence is applied to
        /// every double, string and DateTime column so that rows stay intact; columns of
        /// other types are left unchanged.
        /// </summary>
        /// <param name="ranges">The row ranges to shuffle, each one independently.</param>
        public void ShuffleWithRanges(IEnumerable<IntRange> ranges)
        {
            // process all given ranges - e.g. the training partition
            foreach (IntRange range in ranges)
            {
                List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

                // Fisher-Yates: walk down from the last row of the range and pair each row
                // with a random row at or below it.
                // NOTE(review): assumes IntRange.End is exclusive, as HeuristicLab partition
                // ranges are used elsewhere - confirm against IntRange.
                for (int i = range.End - 1; i > range.Start; --i)
                {
                    // Upper bound of Random.Next is exclusive, so i + 1 allows "stay in place".
                    int rand = random.Next(range.Start, i + 1);
                    shuffledIndices.Add(new Tuple<int, int>(i, rand));
                }

                foreach (string variableName in preprocessingData.VariableNames)
                {
                    if (preprocessingData.IsType<double>(variableName))
                    {
                        reOrderToIndices<double>(variableName, shuffledIndices);
                    }
                    else if (preprocessingData.IsType<string>(variableName))
                    {
                        reOrderToIndices<string>(variableName, shuffledIndices);
                    }
                    else if (preprocessingData.IsType<DateTime>(variableName))
                    {
                        reOrderToIndices<DateTime>(variableName, shuffledIndices);
                    }
                }
            }
        }

        /// <summary>
        /// Applies a sequence of cell swaps to one column.
        /// </summary>
        /// <typeparam name="T">Cell type of the column.</typeparam>
        /// <param name="variableName">Name of the column to reorder.</param>
        /// <param name="indices">
        /// Pairs of row indices; for each pair, in order, the two cells exchange their values.
        /// </param>
        public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices)
        {
            foreach (Tuple<int, int> index in indices)
            {
                int originalIndex = index.Item1;
                int replaceIndex = index.Item2;

                // Read both cells before writing either one.
                T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
                T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

                preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
                preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
            }
        }
    }
}
Note: See TracBrowser for help on using the repository browser.