Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10235

Last change on this file since 10235 was 10234, checked in by mleitner, 11 years ago

Fix linear interpolation

File size: 7.0 KB
Line 
1using HeuristicLab.Data;
2using System;
3using System.Collections.Generic;
4using System.Linq;
5using System.Text;
6
7namespace HeuristicLab.DataPreprocessing.Implementations
8{
9    class PreprocessingDataManipulation
10    {
11        private IPreprocessingData preprocessingData;
12        private StatisticInfo statisticInfo;
13
14        public PreprocessingDataManipulation(IPreprocessingData _prepocessingData) {
15            preprocessingData = _prepocessingData;
16            statisticInfo = new StatisticInfo(preprocessingData);
17        }
18
19        public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value)
20        {
21            foreach (int index in indices)
22            {
23                preprocessingData.SetCell<T>(variableName, index, value);
24            }
25        }
26
27        public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices)
28        {
29            double average = statisticInfo.GetAverage(variableName);
30            ReplaceIndicesByValue<double>(variableName, indices, average);
31        }
32
33        public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices)
34        {
35            double median = statisticInfo.GetMedian(variableName);
36            ReplaceIndicesByValue<double>(variableName, indices, median);
37        }
38
39        public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices)
40        {
41            Random r = new Random();
42
43            double max = statisticInfo.GetMax<double>(variableName);
44            double min = statisticInfo.GetMin<double>(variableName);
45            double randMultiplier = (max - min);
46            foreach (int index in indices)
47            {
48                double rand = r.NextDouble() * randMultiplier + min;
49                preprocessingData.SetCell<double>(variableName, index, rand);
50            }
51        }
52
53        public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices)
54        {
55            int countValues = preprocessingData.GetValues<double>(variableName).Count();
56            foreach (int index in indices)
57            {
58                // dont replace first or last values
59                if (index > 0 && index < countValues)
60                {
61                    int prevIndex = indexOfPrevPresentValue(variableName, index);
62                    int nextIndex = indexOfNextPresentValue(variableName, index);
63
64                    // no neighbours found
65                    if (prevIndex < 0 && nextIndex >= countValues)
66                    {
67                        continue;
68                    }
69                    double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
70                    double next = preprocessingData.GetCell<double>(variableName, nextIndex);
71
72                    int valuesToInterpolate = nextIndex - prevIndex;
73
74                    double interpolationStep = (prev + next) / valuesToInterpolate;
75
76                    for (int i = prevIndex; i < nextIndex; ++i) {
77                        double interpolated = prev + (interpolationStep * (i-prevIndex));
78                        preprocessingData.SetCell<double>(variableName, i, interpolated);
79                    }
80                }
81            }
82        }
83
84        private int indexOfPrevPresentValue(string variableName, int start) {
85            int offset = start - 1;
86            while(offset >= 0 && preprocessingData.IsMissingValue(variableName, offset)){
87                offset--;
88            }
89
90            return offset;
91        }
92
93        private int indexOfNextPresentValue(string variableName, int start)
94        {
95            int offset = start + 1;
96            while (offset < preprocessingData.Rows && preprocessingData.IsMissingValue(variableName,  offset))
97            {
98                offset++;
99            }
100
101            return offset;
102        }
103
104        public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices)
105        {
106            if (preprocessingData.IsType<double>(variableName))
107            {
108                ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
109            }
110            else if (preprocessingData.IsType<string>(variableName))
111            {
112                ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
113            }
114            else if (preprocessingData.IsType<DateTime>(variableName))
115            {
116                ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
117            }
118            else
119            {
120                throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
121            }
122        }
123
124        public void ShuffleWithRanges(IEnumerable<IntRange> ranges)
125        {
126            // init random outside loop
127            Random random = new Random();
128
129            // process all given ranges - e.g. TrainingPartition, Trainingpartition
130            foreach (IntRange range in ranges) {
131                List<Tuple<int, int>> shuffledIndices = new List<Tuple<int,int>>();
132               
133                // generate random indices used for shuffeling each column
134                for (int i = range.End; i > range.Start; --i)
135                {
136                    int rand = random.Next(range.Start, i);
137                    shuffledIndices.Add(new Tuple<int,int>(i,rand));
138                }
139
140                foreach (string variableName in preprocessingData.VariableNames)
141                {
142                    if (preprocessingData.IsType<double>(variableName))
143                    {
144                        reOrderToIndices<double>(variableName, shuffledIndices);
145                    }
146                    else if (preprocessingData.IsType<string>(variableName))
147                    {
148                        reOrderToIndices<string>(variableName, shuffledIndices);
149                    }
150                    else if (preprocessingData.IsType<DateTime>(variableName))
151                    {
152                        reOrderToIndices<DateTime>(variableName, shuffledIndices);
153                    }
154                }
155            }     
156        }
157
158        public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
159            // process all columns equally
160            foreach(Tuple<int, int> index in indices)
161            {
162                int originalIndex = index.Item1;
163                int replaceIndex = index.Item2;
164
165                T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
166                T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);
167
168                preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
169                preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
170            }
171        }
172    }
173}
Note: See TracBrowser for help on using the repository browser.