using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Data;

namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Replaces, interpolates and reorders cells of an <see cref="IPreprocessingData"/> instance,
  /// using an <see cref="IStatisticsLogic"/> for column statistics and an
  /// <see cref="ISearchLogic"/> to detect missing values.
  /// </summary>
  /// <remarks>
  /// NOTE(review): the generic type arguments used below (IEnumerable of int, Tuple of int/int,
  /// GetValues/GetCell/GetMin/GetMax with double, and the double/string/DateTime column types)
  /// are assumptions about the collaborating interfaces - confirm them against the declarations
  /// of IPreprocessingData, IStatisticsLogic and IPreprocessingDataManipulation.
  /// </remarks>
  public class PreprocessingDataManipulation : IPreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticInfo;
    private readonly ISearchLogic searchLogic;

    // A single generator per instance: creating a time-seeded Random on every call can
    // produce identical sequences for calls made in quick succession.
    private readonly Random random = new Random();

    /// <summary>
    /// Creates a manipulation helper operating on the given data.
    /// </summary>
    /// <param name="_prepocessingData">Data whose cells are read and overwritten.</param>
    /// <param name="theSearchLogic">Used to decide whether a cell is a missing value.</param>
    /// <param name="theStatisticsLogic">Source of average, median, min, max and most common value.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticInfo = theStatisticsLogic;
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every given row of the column.
    /// </summary>
    /// <typeparam name="T">Type of the value written to the cells.</typeparam>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    /// <param name="value">Replacement value.</param>
    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
      foreach (int index in rowIndices) {
        preprocessingData.SetCell<T>(columnIndex, index, value);
      }
    }

    /// <summary>
    /// Overwrites the given rows with the column average reported by the statistics logic.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    public void ReplaceIndicesByAverageValue(int columnIndex, IEnumerable<int> rowIndices) {
      double average = statisticInfo.GetAverage(columnIndex);
      ReplaceIndicesByValue<double>(columnIndex, rowIndices, average);
    }

    /// <summary>
    /// Overwrites the given rows with the column median reported by the statistics logic.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    public void ReplaceIndicesByMedianValue(int columnIndex, IEnumerable<int> rowIndices) {
      double median = statisticInfo.GetMedian(columnIndex);
      ReplaceIndicesByValue<double>(columnIndex, rowIndices, median);
    }

    /// <summary>
    /// Overwrites each given row with an independent random value scaled into the
    /// [min, max] span of the column as reported by the statistics logic.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    public void ReplaceIndicesByRandomValue(int columnIndex, IEnumerable<int> rowIndices) {
      double max = statisticInfo.GetMax<double>(columnIndex);
      double min = statisticInfo.GetMin<double>(columnIndex);
      double randMultiplier = max - min;

      foreach (int index in rowIndices) {
        double rand = random.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(columnIndex, index, rand);
      }
    }

    /// <summary>
    /// For every given row, fills the gap between the nearest non-missing cell before it and
    /// the nearest non-missing cell after it with linearly interpolated values.
    /// Rows at the first or last position, and rows that lack a present neighbour on either
    /// side, are left untouched.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows whose surrounding gap is interpolated.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(int columnIndex, IEnumerable<int> rowIndices) {
      int countValues = preprocessingData.GetValues<double>(columnIndex).Count();

      foreach (int index in rowIndices) {
        // the first and last row cannot have neighbours on both sides
        if (index <= 0 || index >= countValues - 1) {
          continue;
        }

        int prevIndex = indexOfPrevPresentValue(columnIndex, index);
        int nextIndex = indexOfNextPresentValue(columnIndex, index);

        // interpolation needs BOTH neighbours; with only one of them the other index
        // is -1 or past the end and must not be used to read a cell
        if (prevIndex < 0 || nextIndex >= countValues) {
          continue;
        }

        double prev = preprocessingData.GetCell<double>(columnIndex, prevIndex);
        double next = preprocessingData.GetCell<double>(columnIndex, nextIndex);

        int valuesToInterpolate = nextIndex - prevIndex;
        // slope between the two present neighbours (difference, not sum)
        double interpolationStep = (next - prev) / valuesToInterpolate;

        // only the cells strictly between the neighbours are written
        for (int i = prevIndex + 1; i < nextIndex; ++i) {
          double interpolated = prev + (interpolationStep * (i - prevIndex));
          preprocessingData.SetCell<double>(columnIndex, i, interpolated);
        }
      }
    }

    // Walks backwards from start - 1 to the first cell that is not a missing value;
    // returns -1 when there is none.
    private int indexOfPrevPresentValue(int columnIndex, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset--;
      }
      return offset;
    }

    // Walks forwards from start + 1 to the first cell that is not a missing value;
    // returns preprocessingData.Rows when there is none.
    private int indexOfNextPresentValue(int columnIndex, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset++;
      }
      return offset;
    }

    /// <summary>
    /// Overwrites the given rows with the most common value of the column.
    /// </summary>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows to overwrite.</param>
    /// <exception cref="ArgumentException">The column is of none of the handled types.</exception>
    public void ReplaceIndicesByMostCommonValue(int columnIndex, IEnumerable<int> rowIndices) {
      if (preprocessingData.IsType<double>(columnIndex)) {
        ReplaceIndicesByValue<double>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<double>(columnIndex));
      } else if (preprocessingData.IsType<string>(columnIndex)) {
        ReplaceIndicesByValue<string>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<string>(columnIndex));
      } else if (preprocessingData.IsType<DateTime>(columnIndex)) {
        ReplaceIndicesByValue<DateTime>(columnIndex, rowIndices, statisticInfo.GetMostCommonValue<DateTime>(columnIndex));
      } else {
        throw new ArgumentException("column with index: " + columnIndex + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Shuffles the rows inside each given range; rows never leave their range and every
    /// column receives the same row permutation.
    /// </summary>
    /// <param name="ranges">Row ranges to shuffle independently, e.g. training and test partition.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      foreach (IntRange range in ranges) {
        // NOTE(review): range.End is treated as an inclusive row index here - confirm
        // against the IntRange contract used by the callers.
        int length = range.End - range.Start + 1;
        if (length < 2) {
          continue;
        }

        // sources[k] is the row whose value ends up at row range.Start + k
        int[] sources = new int[length];
        for (int k = 0; k < length; ++k) {
          sources[k] = range.Start + k;
        }

        // Fisher-Yates shuffle: yields a true permutation, so no value is lost or
        // duplicated when the moves are applied against a snapshot of the column
        for (int i = length - 1; i > 0; --i) {
          int j = random.Next(i + 1);
          int swapped = sources[i];
          sources[i] = sources[j];
          sources[j] = swapped;
        }

        List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>(length);
        for (int k = 0; k < length; ++k) {
          shuffledIndices.Add(Tuple.Create(range.Start + k, sources[k]));
        }

        reOrderToIndices(shuffledIndices);
      }
    }

    /// <summary>
    /// Reorders all rows so that row i receives the value previously stored at the i-th
    /// element of <paramref name="indices"/>.
    /// </summary>
    /// <param name="indices">Source row for each target row, in target order.</param>
    public void reOrderToIndices(IEnumerable<int> indices) {
      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

      // single pass: avoids re-enumerating the sequence for every element
      int targetIndex = 0;
      foreach (int sourceIndex in indices) {
        indicesTuple.Add(Tuple.Create(targetIndex, sourceIndex));
        targetIndex++;
      }

      reOrderToIndices(indicesTuple);
    }

    /// <summary>
    /// Applies the given (target row, source row) moves to every column of a handled type;
    /// columns of any other type are skipped.
    /// </summary>
    /// <param name="indices">Pairs of target row (Item1) and source row (Item2).</param>
    public void reOrderToIndices(IList<Tuple<int, int>> indices) {
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.IsType<double>(i)) {
          reOrderToIndices<double>(i, indices);
        } else if (preprocessingData.IsType<string>(i)) {
          reOrderToIndices<string>(i, indices);
        } else if (preprocessingData.IsType<DateTime>(i)) {
          reOrderToIndices<DateTime>(i, indices);
        }
      }
    }

    // Applies the moves to one column. All source values are read from a snapshot taken
    // before the first write, so earlier moves cannot influence later ones.
    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));

      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T replaceValue = originalData[replaceIndex];
        preprocessingData.SetCell<T>(columnIndex, originalIndex, replaceValue);
      }
    }
  }
}