Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs @ 10238

Last change on this file since 10238 was 10238, checked in by sbreuer, 11 years ago

MissingValue fix

File size: 7.1 KB
Line 
1using HeuristicLab.Data;
2using System;
3using System.Collections.Generic;
4using System.Linq;
5using System.Text;
6
namespace HeuristicLab.DataPreprocessing.Implementations
{
    /// <summary>
    /// Bulk cell manipulations on an <see cref="IPreprocessingData"/> instance:
    /// replacing selected rows of a column by a fixed, statistical, random or
    /// interpolated value, and shuffling rows inside given index ranges.
    /// </summary>
    class PreprocessingDataManipulation
    {
        private readonly IPreprocessingData preprocessingData;
        private readonly StatisticInfo statisticInfo;
        private readonly ISearchLogic searchLogic;

        /// <summary>
        /// Creates a manipulator working on the given data and builds the search
        /// and statistic helpers on top of the same data instance.
        /// </summary>
        /// <param name="_prepocessingData">The data whose cells are read and overwritten.</param>
        public PreprocessingDataManipulation(IPreprocessingData _prepocessingData)
        {
            preprocessingData = _prepocessingData;
            // TODO (carried over from the original author; the intended follow-up is not stated)
            searchLogic = new SearchLogic(preprocessingData);
            statisticInfo = new StatisticInfo(preprocessingData, searchLogic);
        }

        /// <summary>
        /// Writes <paramref name="value"/> into every listed row of the given column.
        /// </summary>
        /// <typeparam name="T">Cell type of the column.</typeparam>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to overwrite.</param>
        /// <param name="value">The replacement value.</param>
        public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value)
        {
            foreach (int index in indices)
            {
                preprocessingData.SetCell<T>(variableName, index, value);
            }
        }

        /// <summary>
        /// Overwrites the listed rows with the value returned by
        /// <c>statisticInfo.GetAverage</c> for the column.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to overwrite.</param>
        public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices)
        {
            double average = statisticInfo.GetAverage(variableName);
            ReplaceIndicesByValue<double>(variableName, indices, average);
        }

        /// <summary>
        /// Overwrites the listed rows with the value returned by
        /// <c>statisticInfo.GetMedian</c> for the column.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to overwrite.</param>
        public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices)
        {
            double median = statisticInfo.GetMedian(variableName);
            ReplaceIndicesByValue<double>(variableName, indices, median);
        }

        /// <summary>
        /// Overwrites each listed row with an independent random value drawn from
        /// the half-open interval [min, max) of the column, where min and max come
        /// from <c>statisticInfo.GetMin</c> / <c>statisticInfo.GetMax</c>.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to overwrite.</param>
        public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices)
        {
            Random r = new Random();

            double max = statisticInfo.GetMax<double>(variableName);
            double min = statisticInfo.GetMin<double>(variableName);
            double randMultiplier = (max - min);
            foreach (int index in indices)
            {
                // NextDouble() is in [0, 1), so the result is in [min, max).
                double rand = r.NextDouble() * randMultiplier + min;
                preprocessingData.SetCell<double>(variableName, index, rand);
            }
        }

        /// <summary>
        /// Replaces the listed rows by linear interpolation between the nearest
        /// non-missing value before and the nearest non-missing value after the row.
        /// </summary>
        /// <remarks>
        /// All rows strictly between the two neighbours are filled, not only the
        /// requested one. Rows that are first or last in the column, or that lack a
        /// present neighbour on either side, are left unchanged.
        /// </remarks>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to interpolate.</param>
        public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices)
        {
            int countValues = preprocessingData.GetValues<double>(variableName).Count();
            foreach (int index in indices)
            {
                // The first and last rows have a neighbour on one side only.
                if (index <= 0 || index >= countValues - 1)
                {
                    continue;
                }

                int prevIndex = indexOfPrevPresentValue(variableName, index);
                int nextIndex = indexOfNextPresentValue(variableName, index);

                // Interpolation needs a present value on BOTH sides; if either search
                // ran off the column, reading that cell would be out of range.
                if (prevIndex < 0 || nextIndex >= countValues)
                {
                    continue;
                }

                double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
                double next = preprocessingData.GetCell<double>(variableName, nextIndex);

                // Slope of the straight line through (prevIndex, prev) and (nextIndex, next).
                int valuesToInterpolate = nextIndex - prevIndex;
                double interpolationStep = (next - prev) / valuesToInterpolate;

                // Fill only the gap; the two neighbour cells keep their values.
                for (int i = prevIndex + 1; i < nextIndex; ++i)
                {
                    double interpolated = prev + (interpolationStep * (i - prevIndex));
                    preprocessingData.SetCell<double>(variableName, i, interpolated);
                }
            }
        }

        /// <summary>
        /// Searches backwards from <paramref name="start"/> (exclusive) for the
        /// closest row that is not reported as missing by the search logic.
        /// </summary>
        /// <returns>The row index found, or -1 when no such row exists.</returns>
        private int indexOfPrevPresentValue(string variableName, int start)
        {
            int offset = start - 1;
            while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset))
            {
                offset--;
            }

            return offset;
        }

        /// <summary>
        /// Searches forwards from <paramref name="start"/> (exclusive) for the
        /// closest row that is not reported as missing by the search logic.
        /// </summary>
        /// <returns>
        /// The row index found, or <c>preprocessingData.Rows</c> when no such row exists.
        /// </returns>
        private int indexOfNextPresentValue(string variableName, int start)
        {
            int offset = start + 1;
            while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset))
            {
                offset++;
            }

            return offset;
        }

        /// <summary>
        /// Overwrites the listed rows with the value returned by
        /// <c>statisticInfo.GetMostCommonValue</c> for the column. Supported column
        /// types are <see cref="double"/>, <see cref="string"/> and <see cref="DateTime"/>.
        /// </summary>
        /// <param name="variableName">Name of the column to modify.</param>
        /// <param name="indices">Row indices to overwrite.</param>
        /// <exception cref="ArgumentException">The column has an unsupported type.</exception>
        public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices)
        {
            if (preprocessingData.IsType<double>(variableName))
            {
                ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
            }
            else if (preprocessingData.IsType<string>(variableName))
            {
                ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
            }
            else if (preprocessingData.IsType<DateTime>(variableName))
            {
                ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
            }
            else
            {
                throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
            }
        }

        /// <summary>
        /// Shuffles the rows inside each given range. One swap sequence is generated
        /// per range and applied to every column, so the cells of a row stay together.
        /// Columns whose type is not double, string or DateTime are not touched.
        /// </summary>
        /// <param name="ranges">Row ranges to shuffle independently of each other.</param>
        public void ShuffleWithRanges(IEnumerable<IntRange> ranges)
        {
            // init random outside loop
            Random random = new Random();

            // process all given ranges - e.g. the training and the test partition
            foreach (IntRange range in ranges)
            {
                List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

                // Fisher-Yates: walk from the end of the range and swap each position
                // with a random position at or before it. Random.Next has an exclusive
                // upper bound, hence "i + 1" so that a position may stay where it is;
                // without it only cyclic permutations would be produced.
                // NOTE(review): range.End is treated as an inclusive row index here, as
                // in the original code - confirm against the IntRange contract.
                for (int i = range.End; i > range.Start; --i)
                {
                    int rand = random.Next(range.Start, i + 1);
                    shuffledIndices.Add(new Tuple<int, int>(i, rand));
                }

                foreach (string variableName in preprocessingData.VariableNames)
                {
                    if (preprocessingData.IsType<double>(variableName))
                    {
                        reOrderToIndices<double>(variableName, shuffledIndices);
                    }
                    else if (preprocessingData.IsType<string>(variableName))
                    {
                        reOrderToIndices<string>(variableName, shuffledIndices);
                    }
                    else if (preprocessingData.IsType<DateTime>(variableName))
                    {
                        reOrderToIndices<DateTime>(variableName, shuffledIndices);
                    }
                }
            }
        }

        /// <summary>
        /// Applies a sequence of pairwise swaps to one column, in list order.
        /// </summary>
        /// <typeparam name="T">Cell type of the column.</typeparam>
        /// <param name="variableName">Name of the column to reorder.</param>
        /// <param name="indices">
        /// Swap pairs; for each pair the cells at <c>Item1</c> and <c>Item2</c> are exchanged.
        /// </param>
        public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices)
        {
            // process all columns equally
            foreach (Tuple<int, int> index in indices)
            {
                int originalIndex = index.Item1;
                int replaceIndex = index.Item2;

                T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
                T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

                preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
                preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
            }
        }
    }
}
Note: See TracBrowser for help on using the repository browser.