using HeuristicLab.Data;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace HeuristicLab.DataPreprocessing.Implementations {
  /// <summary>
  /// Cell-level manipulation operations on an <see cref="IPreprocessingData"/> instance:
  /// replacing selected cells of a variable (column) by a fixed or derived value,
  /// and shuffling rows inside given index ranges.
  /// </summary>
  class PreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly StatisticInfo statisticInfo;
    private readonly ISearchLogic searchLogic;

    /// <summary>
    /// Creates the manipulation helper together with the search logic and the
    /// statistic info it needs, all operating on the same data instance.
    /// </summary>
    /// <param name="_prepocessingData">The data that all operations read from and write to.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData) {
      preprocessingData = _prepocessingData;
      //todo
      searchLogic = new SearchLogic(preprocessingData);
      statisticInfo = new StatisticInfo(preprocessingData, searchLogic);
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of the variable.
    /// </summary>
    /// <typeparam name="T">Type of the value written into the cells.</typeparam>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    /// <param name="value">The replacement value.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        preprocessingData.SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>
    /// Replaces the given rows by the value of <c>statisticInfo.GetAverage</c> for the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>
    /// Replaces the given rows by the value of <c>statisticInfo.GetMedian</c> for the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>
    /// Replaces each given row by its own random value drawn from the interval
    /// spanned by <c>statisticInfo.GetMin</c> and <c>statisticInfo.GetMax</c> of the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      Random r = new Random();

      double max = statisticInfo.GetMax(variableName);
      double min = statisticInfo.GetMin(variableName);
      double randMultiplier = (max - min);

      foreach (int index in indices) {
        // NextDouble() is in [0, 1), so the result lies in [min, max)
        double rand = r.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// Replaces the given rows by a linear interpolation between the nearest present
    /// (non-missing) values before and after each row. All cells strictly between
    /// those two neighbours are filled; the neighbours themselves are left untouched.
    /// Rows that are the first or last row, or that lack a present value on either
    /// side, are skipped.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are interpolated.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = preprocessingData.GetValues<double>(variableName).Count();

      foreach (int index in indices) {
        // the first and the last row have a neighbour on one side only
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(variableName, index);
        int nextIndex = indexOfNextPresentValue(variableName, index);

        // interpolation needs BOTH neighbours; the searches return -1 or an index
        // past the end when they run out of rows, which must not be passed to GetCell
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(variableName, prevIndex);
        double next = preprocessingData.GetCell<double>(variableName, nextIndex);

        // slope of the line through (prevIndex, prev) and (nextIndex, next)
        int valuesToInterpolate = nextIndex - prevIndex;
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // start after prevIndex: the present neighbour keeps its value
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(variableName, i, interpolated);
        }
      }
    }

    /// <summary>
    /// Searches backwards from <paramref name="start"/> (exclusive) for the nearest row
    /// that is not reported as missing by the search logic.
    /// </summary>
    /// <returns>The index of that row, or -1 if there is none.</returns>
    private int indexOfPrevPresentValue(string variableName, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(variableName, offset)) {
        offset--;
      }
      return offset;
    }

    /// <summary>
    /// Searches forwards from <paramref name="start"/> (exclusive) for the nearest row
    /// that is not reported as missing by the search logic.
    /// </summary>
    /// <returns>The index of that row, or <c>preprocessingData.Rows</c> if there is none.</returns>
    private int indexOfNextPresentValue(string variableName, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(variableName, offset)) {
        offset++;
      }
      return offset;
    }

    /// <summary>
    /// Replaces the given rows by the value of <c>statisticInfo.GetMostCommonValue</c>
    /// for the variable, dispatching on the variable's element type.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices whose cells are overwritten.</param>
    /// <exception cref="ArgumentException">The variable has none of the handled element types.</exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      // NOTE(review): handled element types are taken to be double, string and DateTime - confirm
      if (preprocessingData.IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (preprocessingData.IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (preprocessingData.IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Shuffles the rows inside each given range. One sequence of swaps is generated
    /// per range and applied to every variable, so the cells of a row stay together.
    /// </summary>
    /// <param name="ranges">Row index ranges; each range is shuffled independently.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // init random outside loop
      Random random = new Random();

      // process all given ranges - e.g. a training partition
      foreach (IntRange range in ranges) {
        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

        // generate the swap pairs used for shuffling each column
        // NOTE(review): range.End is used as an inclusive row index here - confirm
        // against the convention of IntRange
        for (int i = range.End; i > range.Start; --i) {
          // the upper bound of Random.Next is exclusive; i + 1 lets row i stay in
          // place, which is required for every permutation to be reachable
          int rand = random.Next(range.Start, i + 1);
          shuffledIndices.Add(new Tuple<int, int>(i, rand));
        }

        // NOTE(review): handled element types are taken to be double, string and DateTime - confirm
        foreach (string variableName in preprocessingData.VariableNames) {
          if (preprocessingData.IsType<double>(variableName)) {
            reOrderToIndices<double>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<string>(variableName)) {
            reOrderToIndices<string>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<DateTime>(variableName)) {
            reOrderToIndices<DateTime>(variableName, shuffledIndices);
          }
        }
      }
    }

    /// <summary>
    /// Applies a sequence of swaps to one variable: for each pair, the cells at
    /// <c>Item1</c> and <c>Item2</c> exchange their values, in list order.
    /// </summary>
    /// <typeparam name="T">Element type of the variable.</typeparam>
    /// <param name="variableName">Name of the variable (column) to reorder.</param>
    /// <param name="indices">Pairs of row indices to swap.</param>
    public void reOrderToIndices<T>(string variableName, List<Tuple<int, int>> indices) {
      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T tmp = preprocessingData.GetCell<T>(variableName, originalIndex);
        T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

        preprocessingData.SetCell<T>(variableName, originalIndex, replaceValue);
        preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
      }
    }
  }
}