Changeset 10249 for branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs
- Timestamp:
- 12/18/13 15:36:18 (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingDataManipulation.cs
using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Data;

namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Cell-level manipulation operations (value replacement and row shuffling)
  /// performed on an <see cref="IPreprocessingData"/> instance.
  /// </summary>
  class PreprocessingDataManipulation : IPreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticInfo;
    private readonly ISearchLogic searchLogic;

    /// <summary>
    /// Creates a manipulation object that works on the given data and uses the
    /// supplied search and statistics logic.
    /// </summary>
    /// <param name="_prepocessingData">Data whose cells are read and written.</param>
    /// <param name="theSearchLogic">Used to detect missing values.</param>
    /// <param name="theStatisticsLogic">Used to obtain average, median, min, max and most common value.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticInfo = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    /// <param name="value">Value written to each of the rows.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        preprocessingData.SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>
    /// Overwrites the given rows with the average reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>
    /// Overwrites the given rows with the median reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>
    /// Overwrites each given row with its own random value drawn between the
    /// minimum and maximum reported by the statistics logic.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      Random r = new Random();

      double max = statisticInfo.GetMax<double>(variableName);
      double min = statisticInfo.GetMin<double>(variableName);
      double randMultiplier = (max - min);
      foreach (int index in indices) {
        // NextDouble() is in [0, 1), so rand is in [min, max)
        double rand = r.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// For each given row, locates the nearest present (non-missing) value before
    /// and after it and fills every row strictly between those two neighbours
    /// with values on the straight line connecting them.
    /// </summary>
    /// <remarks>
    /// The first and last rows are skipped, as are rows that do not have a
    /// present value on both sides.
    /// </remarks>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to interpolate.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = preprocessingData.GetValues<double>(variableName).Count();
      foreach (int index in indices) {
        // the first and last rows have a neighbour on one side only
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(variableName, index);
        int nextIndex = indexOfNextPresentValue(variableName, index);

        // a present value is required on BOTH sides; otherwise one of the
        // indices is outside the data and must not be passed to GetCell
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
        double next = preprocessingData.GetCell<double>(variableName, nextIndex);

        int valuesToInterpolate = nextIndex - prevIndex;

        // slope of the line through (prevIndex, prev) and (nextIndex, next)
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // start after prevIndex: the neighbour itself keeps its value
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(variableName, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Walks backwards from the row before <paramref name="start"/> until a row
    /// is found that is not a missing value.
    /// </summary>
    /// <returns>The index of that row, or -1 if every preceding row is missing.</returns>
    private int indexOfPrevPresentValue(string variableName, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
        offset--;
      }

      return offset;
    }

    /// <summary>
    /// Walks forwards from the row after <paramref name="start"/> until a row
    /// is found that is not a missing value.
    /// </summary>
    /// <returns>The index of that row, or preprocessingData.Rows if every following row is missing.</returns>
    private int indexOfNextPresentValue(string variableName, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset)) {
        offset++;
      }

      return offset;
    }

    /// <summary>
    /// Overwrites the given rows with the most common value of the variable.
    /// Supported variable types are double, string and DateTime.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    /// <exception cref="ArgumentException">The variable has none of the supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      if (preprocessingData.IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (preprocessingData.IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (preprocessingData.IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Shuffles the rows inside each given range. The same sequence of swaps is
    /// applied to every double, string and DateTime variable so that rows stay
    /// aligned across columns; variables of any other type are left untouched.
    /// </summary>
    /// <param name="ranges">Row ranges that are shuffled independently of each other.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // init random outside loop
      Random random = new Random();

      // process all given ranges - e.g. TrainingPartition, TestPartition
      foreach (IntRange range in ranges) {
        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

        // generate random swap partners used for shuffling each column (Fisher-Yates)
        // NOTE(review): the loop uses range.End as a row index, i.e. treats it as
        // inclusive - confirm against the IntRange semantics used by callers.
        for (int i = range.End; i > range.Start; --i) {
          // the upper bound of Random.Next is exclusive; i + 1 allows a row to
          // stay in place, which is required for an unbiased shuffle
          int rand = random.Next(range.Start, i + 1);
          shuffledIndices.Add(new Tuple<int, int>(i, rand));
        }

        foreach (string variableName in preprocessingData.VariableNames) {
          if (preprocessingData.IsType<double>(variableName)) {
            reOrderToIndices<double>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<string>(variableName)) {
            reOrderToIndices<string>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<DateTime>(variableName)) {
            reOrderToIndices<DateTime>(variableName, shuffledIndices);
          }
        }
      }
    }

    /// <summary>
    /// Applies the given swaps, in order, to one variable: for each pair the
    /// cells at Item1 and Item2 exchange their values.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to reorder.</param>
    /// <param name="indices">Pairs of row indices whose cells are swapped.</param>
    public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
        T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

        preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
        preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
      }
    }
  }
}
Note: See TracChangeset
for help on using the changeset viewer.