using HeuristicLab.Data;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace HeuristicLab.DataPreprocessing.Implementations {
  /// <summary>
  /// Replaces and reorders cell values of an <see cref="IPreprocessingData"/> instance,
  /// using a <see cref="StatisticInfo"/> built on the same data for average, median,
  /// min/max and most-common-value lookups.
  /// </summary>
  class PreprocessingDataManipulation {
    private readonly IPreprocessingData preprocessingData;
    private readonly StatisticInfo statisticInfo;

    /// <summary>
    /// Creates a manipulator for the given data and a <see cref="StatisticInfo"/> over it.
    /// </summary>
    /// <param name="_prepocessingData">The data whose cells are read and overwritten.</param>
    public PreprocessingDataManipulation(IPreprocessingData _prepocessingData) {
      preprocessingData = _prepocessingData;
      statisticInfo = new StatisticInfo(preprocessingData);
    }

    /// <summary>
    /// Writes <paramref name="value"/> into every listed row of the given variable.
    /// </summary>
    /// <typeparam name="T">Type of the value written to the cells.</typeparam>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    /// <param name="value">The replacement value.</param>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        preprocessingData.SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>
    /// Overwrites the listed rows with the value returned by <c>StatisticInfo.GetAverage</c>.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>
    /// Overwrites the listed rows with the value returned by <c>StatisticInfo.GetMedian</c>.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>
    /// Overwrites each listed row with an independently drawn random value in
    /// [min, max), where min and max come from <c>StatisticInfo</c> for the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      Random r = new Random();

      double max = statisticInfo.GetMax(variableName);
      double min = statisticInfo.GetMin(variableName);
      double randMultiplier = (max - min);

      foreach (int index in indices) {
        // NextDouble() is in [0, 1), so the result is in [min, max).
        double rand = r.NextDouble() * randMultiplier + min;
        preprocessingData.SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// Overwrites each listed row with the mean of its direct predecessor and successor.
    /// The first and the last row are never replaced because they lack one neighbour.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = preprocessingData.GetValues<double>(variableName).Count();

      foreach (int index in indices) {
        // Skip the first and last row: index - 1 / index + 1 must both be valid.
        // (The upper bound is countValues - 1; allowing the last row would read past the end.)
        if (index > 0 && index < countValues - 1) {
          double prev = preprocessingData.GetCell<double>(variableName, index - 1);
          double next = preprocessingData.GetCell<double>(variableName, index + 1);

          double interpolated = (prev + next) / 2;
          preprocessingData.SetCell<double>(variableName, index, interpolated);
        }
      }
    }

    /// <summary>
    /// Overwrites the listed rows with the value returned by
    /// <c>StatisticInfo.GetMostCommonValue</c> for the variable.
    /// </summary>
    /// <param name="variableName">Name of the variable (column) to modify.</param>
    /// <param name="indices">Row indices to overwrite.</param>
    /// <exception cref="ArgumentException">
    /// The variable is of none of the types checked below.
    /// </exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      // NOTE(review): the column types dispatched on here (double, string, DateTime)
      // are assumed - confirm against the types IPreprocessingData actually stores.
      if (preprocessingData.IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (preprocessingData.IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (preprocessingData.IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with index: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Shuffles the rows inside each given range. One random permutation is drawn per
    /// range and applied to every variable, so values of the same row stay together.
    /// </summary>
    /// <param name="ranges">Row ranges to shuffle independently of each other.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // A single generator for all ranges.
      Random random = new Random();
      int rowCount = preprocessingData.Rows;

      foreach (IntRange range in ranges) {
        // Start from the identity mapping over ALL rows, because reOrderToIndices reads
        // indices[i] for every row; rows outside the range keep mapping to themselves.
        List<int> shuffledIndices = Enumerable.Range(0, rowCount).ToList();

        // NOTE(review): range.End is treated as exclusive here - confirm against IntRange.
        // Bounds are clamped so that a range reaching past the data cannot index out of range.
        int first = Math.Max(range.Start, 0);
        int end = Math.Min(range.End, rowCount);

        // reOrderToIndices applies the swaps in ascending row order, so swapping row i
        // with a partner drawn from [first, i] is a forward Fisher-Yates shuffle.
        // Random.Next's upper bound is exclusive, hence i + 1.
        for (int i = first + 1; i < end; ++i) {
          shuffledIndices[i] = random.Next(first, i + 1);
        }

        // NOTE(review): same assumed column types as in ReplaceIndicesByMostCommonValue;
        // variables of any other type are left untouched.
        foreach (string variableName in preprocessingData.VariableNames) {
          if (preprocessingData.IsType<double>(variableName)) {
            reOrderToIndices<double>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<string>(variableName)) {
            reOrderToIndices<string>(variableName, shuffledIndices);
          } else if (preprocessingData.IsType<DateTime>(variableName)) {
            reOrderToIndices<DateTime>(variableName, shuffledIndices);
          }
        }
      }
    }

    /// <summary>
    /// For every row <c>i</c> (in ascending order) swaps the cell at row <c>i</c> with the
    /// cell at row <c>indices[i]</c> within the given variable.
    /// </summary>
    /// <typeparam name="T">Type of the cells of the variable.</typeparam>
    /// <param name="variableName">Name of the variable (column) to reorder.</param>
    /// <param name="indices">
    /// Swap partner for each row; must contain an entry for every row of the data.
    /// </param>
    public void reOrderToIndices<T>(string variableName, List<int> indices) {
      for (int i = 0; i < preprocessingData.Rows; i++) {
        int replaceIndex = indices[i];

        // Swapping a row with itself leaves the data unchanged; skip the redundant writes.
        if (replaceIndex == i) {
          continue;
        }

        T tmp = preprocessingData.GetCell<T>(variableName, i);
        T replaceValue = preprocessingData.GetCell<T>(variableName, replaceIndex);

        preprocessingData.SetCell<T>(variableName, i, replaceValue);
        preprocessingData.SetCell<T>(variableName, replaceIndex, tmp);
      }
    }
  }
}