#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Data;

namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Bulk manipulation operations (replace, shuffle, re-order, delete) on the
  /// preprocessing data. Operations that touch several cells are wrapped in
  /// <c>preprocessingData.InTransaction</c>.
  /// </summary>
  public class ManipulationLogic : IManipulationLogic {
    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly IStatisticsLogic statisticsLogic;
    private readonly ISearchLogic searchLogic;
    private readonly IDataGridLogic dataGridLogic;

    /// <summary>
    /// Creates the manipulation logic on top of the given data and helper logics.
    /// </summary>
    /// <param name="_prepocessingData">Data that is read and modified by all operations.</param>
    /// <param name="theSearchLogic">Used to detect missing values during interpolation.</param>
    /// <param name="theStatisticsLogic">Supplies average, median, min, max, variance and missing-value counts.</param>
    /// <param name="theDataGridLogic">Used to set string values and as the source of the <see cref="Changed"/> event.</param>
    public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, ISearchLogic theSearchLogic, IStatisticsLogic theStatisticsLogic, IDataGridLogic theDataGridLogic) {
      preprocessingData = _prepocessingData;
      searchLogic = theSearchLogic;
      statisticsLogic = theStatisticsLogic;
      dataGridLogic = theDataGridLogic;
    }

    /// <summary>
    /// Sets every listed row of one column to the same value.
    /// </summary>
    /// <typeparam name="T">Type of the value written to the cells.</typeparam>
    /// <param name="columnIndex">Column to modify.</param>
    /// <param name="rowIndices">Rows of that column to overwrite.</param>
    /// <param name="value">Value written to each cell.</param>
    public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
      foreach (int index in rowIndices) {
        preprocessingData.SetCell(columnIndex, index, value);
      }
    }

    /// <summary>
    /// Replaces the given cells by the average of their column.
    /// Only double and DateTime columns are handled; other columns are left untouched.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Passed through to the statistics logic.</param>
    public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.IsType<double>(column.Key)) {
            double average = statisticsLogic.GetAverage(column.Key, considerSelection);
            ReplaceIndicesByValue(column.Key, column.Value, average);
          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
            DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue(column.Key, column.Value, average);
          }
        }
      });
    }

    /// <summary>
    /// Replaces the given cells by the median of their column.
    /// Only double and DateTime columns are handled; other columns are left untouched.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Passed through to the statistics logic.</param>
    public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.IsType<double>(column.Key)) {
            double median = statisticsLogic.GetMedian(column.Key, considerSelection);
            ReplaceIndicesByValue(column.Key, column.Value, median);
          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
            DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
            ReplaceIndicesByValue(column.Key, column.Value, median);
          }
        }
      });
    }

    /// <summary>
    /// Replaces each given cell by an individually drawn random value between the
    /// minimum and the maximum of its column.
    /// Only double and DateTime columns are handled; other columns are left untouched.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Passed through to the statistics logic.</param>
    public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
      preprocessingData.InTransaction(() => {
        Random r = new Random();
        foreach (var column in cells) {
          if (preprocessingData.IsType<double>(column.Key)) {
            double max = statisticsLogic.GetMax<double>(column.Key, considerSelection);
            double min = statisticsLogic.GetMin<double>(column.Key, considerSelection);
            double randMultiplier = (max - min);
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier + min;
              preprocessingData.SetCell(column.Key, index, rand);
            }
          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
            DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, considerSelection);
            DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, considerSelection);
            // DateTime values are randomized as an offset in seconds from the minimum
            double randMultiplier = (max - min).TotalSeconds;
            foreach (int index in column.Value) {
              double rand = r.NextDouble() * randMultiplier;
              preprocessingData.SetCell(column.Key, index, min.AddSeconds(rand));
            }
          }
        }
      });
    }

    /// <summary>
    /// Fills the given cells by linear interpolation between the nearest
    /// non-missing value above and below them in the same column.
    /// Cells in the first or last row, and cells that lack a present neighbour on
    /// either side, are skipped. Only double and DateTime columns are handled.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          // stays 0 for unsupported column types, so every index is skipped below
          int countValues = 0;
          if (preprocessingData.IsType<double>(column.Key)) {
            countValues = preprocessingData.GetValues<double>(column.Key, false).Count();
          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
            countValues = preprocessingData.GetValues<DateTime>(column.Key, false).Count();
          }

          foreach (int index in column.Value) {
            // don't replace first or last values, they have a neighbour on one side only
            if (index <= 0 || index >= countValues - 1) {
              continue;
            }

            int prevIndex = indexOfPrevPresentValue(column.Key, index);
            int nextIndex = indexOfNextPresentValue(column.Key, index);

            // interpolation needs a present value on BOTH sides; otherwise the
            // reads below would access a row outside of the column
            if (prevIndex < 0 || nextIndex >= countValues) {
              continue;
            }

            int valuesToInterpolate = nextIndex - prevIndex;

            if (preprocessingData.IsType<double>(column.Key)) {
              double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
              double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
              double interpolationStep = (next - prev) / valuesToInterpolate;

              // fill the whole gap; the present values at both ends stay untouched
              for (int i = prevIndex + 1; i < nextIndex; ++i) {
                double interpolated = prev + (interpolationStep * (i - prevIndex));
                preprocessingData.SetCell(column.Key, i, interpolated);
              }
            } else if (preprocessingData.IsType<DateTime>(column.Key)) {
              DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
              DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
              double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;

              // fill the whole gap; the present values at both ends stay untouched
              for (int i = prevIndex + 1; i < nextIndex; ++i) {
                DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
                preprocessingData.SetCell(column.Key, i, interpolated);
              }
            }
          }
        }
      });
    }

    /// <summary>
    /// Returns the index of the closest row before <paramref name="start"/> that is
    /// not a missing value, or -1 if there is none.
    /// </summary>
    private int indexOfPrevPresentValue(int columnIndex, int start) {
      int offset = start - 1;
      while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset--;
      }
      return offset;
    }

    /// <summary>
    /// Returns the index of the closest row after <paramref name="start"/> that is
    /// not a missing value, or <c>preprocessingData.Rows</c> if there is none.
    /// </summary>
    private int indexOfNextPresentValue(int columnIndex, int start) {
      int offset = start + 1;
      while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
        offset++;
      }
      return offset;
    }

    /// <summary>
    /// Replaces the given cells by the most common value of their column.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="considerSelection">Passed through to the statistics logic.</param>
    /// <exception cref="ArgumentException">A column is neither of type double, string nor DateTime.</exception>
    public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          if (preprocessingData.IsType<double>(column.Key)) {
            ReplaceIndicesByValue(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, considerSelection));
          } else if (preprocessingData.IsType<string>(column.Key)) {
            ReplaceIndicesByValue(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, considerSelection));
          } else if (preprocessingData.IsType<DateTime>(column.Key)) {
            ReplaceIndicesByValue(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, considerSelection));
          } else {
            throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
          }
        }
      });
    }

    /// <summary>
    /// Shuffles the rows of the test partition and of the training partition,
    /// each partition within itself.
    /// </summary>
    public void ShuffleWithRanges() {
      ShuffleWithRanges(new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition });
    }

    /// <summary>
    /// Shuffles the rows inside each of the given ranges; rows are only swapped
    /// with other rows of the same range. <c>End</c> is treated as exclusive.
    /// </summary>
    /// <param name="ranges">Row ranges to shuffle, e.g. training and test partition.</param>
    public void ShuffleWithRanges(IEnumerable<IntRange> ranges) {
      // init random outside loop
      Random random = new Random();

      preprocessingData.InTransaction(() => {
        // process all given ranges - e.g. TrainingPartition, TestPartition
        foreach (IntRange range in ranges) {
          List<Tuple<int, int>> shuffledIndices = new List<Tuple<int, int>>();

          // Fisher-Yates: the swap partner is drawn from [range.Start, i].
          // Random.Next excludes its upper bound, hence i + 1; without it a row
          // could never keep its position and the shuffle would be biased.
          for (int i = range.End - 1; i > range.Start; --i) {
            int rand = random.Next(range.Start, i + 1);
            shuffledIndices.Add(new Tuple<int, int>(i, rand));
          }

          ShuffleToIndices(shuffledIndices);
        }
      });
    }

    /// <summary>
    /// Re-orders all rows so that row <c>i</c> receives the values of the row whose
    /// index is the <c>i</c>-th element of <paramref name="indices"/>.
    /// </summary>
    /// <param name="indices">Source row index for every target row, in target order.</param>
    public void ReOrderToIndices(IEnumerable<int> indices) {
      List<Tuple<int, int>> indicesTuple = new List<Tuple<int, int>>();

      // single pass; Count()/ElementAt() per iteration would re-enumerate the sequence
      int targetIndex = 0;
      foreach (int sourceIndex in indices) {
        indicesTuple.Add(new Tuple<int, int>(targetIndex, sourceIndex));
        targetIndex++;
      }

      ReOrderToIndices(indicesTuple);
    }

    /// <summary>
    /// Re-orders all double, string and DateTime columns: for every tuple the row
    /// <c>Item1</c> receives the original value of row <c>Item2</c>.
    /// </summary>
    /// <param name="indices">Pairs of (target row, source row).</param>
    public void ReOrderToIndices(IList<Tuple<int, int>> indices) {
      preprocessingData.InTransaction(() => {
        for (int i = 0; i < preprocessingData.Columns; ++i) {
          if (preprocessingData.IsType<double>(i)) {
            reOrderToIndices<double>(i, indices);
          } else if (preprocessingData.IsType<string>(i)) {
            reOrderToIndices<string>(i, indices);
          } else if (preprocessingData.IsType<DateTime>(i)) {
            reOrderToIndices<DateTime>(i, indices);
          }
        }
      });
    }

    /// <summary>
    /// Applies the given swaps, in order, to all double, string and DateTime columns:
    /// for every tuple the values of rows <c>Item1</c> and <c>Item2</c> are exchanged.
    /// </summary>
    /// <param name="indices">Pairs of rows to swap.</param>
    public void ShuffleToIndices(IList<Tuple<int, int>> indices) {
      preprocessingData.InTransaction(() => {
        for (int i = 0; i < preprocessingData.Columns; ++i) {
          if (preprocessingData.IsType<double>(i)) {
            ShuffleToIndices<double>(i, indices);
          } else if (preprocessingData.IsType<string>(i)) {
            ShuffleToIndices<string>(i, indices);
          } else if (preprocessingData.IsType<DateTime>(i)) {
            ShuffleToIndices<DateTime>(i, indices);
          }
        }
      });
    }

    /// <summary>
    /// Re-orders a single column. Values are taken from a snapshot of the column,
    /// so earlier writes do not influence later reads.
    /// </summary>
    private void reOrderToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex, false));

      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T replaceValue = originalData[replaceIndex];
        preprocessingData.SetCell(columnIndex, originalIndex, replaceValue);
      }
    }

    /// <summary>
    /// Swaps the cells of a single column pair by pair, working on the live data
    /// so that every swap sees the result of the previous ones.
    /// </summary>
    private void ShuffleToIndices<T>(int columnIndex, IList<Tuple<int, int>> indices) {
      // process all columns equally
      foreach (Tuple<int, int> index in indices) {
        int originalIndex = index.Item1;
        int replaceIndex = index.Item2;

        T tmp = preprocessingData.GetCell<T>(columnIndex, originalIndex);
        T replaceValue = preprocessingData.GetCell<T>(columnIndex, replaceIndex);

        preprocessingData.SetCell(columnIndex, originalIndex, replaceValue);
        preprocessingData.SetCell(columnIndex, replaceIndex, tmp);
      }
    }

    /// <summary>
    /// Sets every given cell to the given string by calling
    /// <c>dataGridLogic.SetValue</c> for each of them.
    /// </summary>
    /// <param name="cells">Maps a column index to the row indices to replace in that column.</param>
    /// <param name="value">String handed to the data grid logic for each cell.</param>
    public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
      preprocessingData.InTransaction(() => {
        foreach (var column in cells) {
          foreach (var rowIdx in column.Value) {
            dataGridLogic.SetValue(value, column.Key, rowIdx);
          }
        }
      });
    }

    /// <summary>
    /// Returns the indices of all rows whose share of missing values, in percent
    /// of the column count, is greater than <paramref name="percent"/>.
    /// </summary>
    public List<int> RowsWithMissingValuesGreater(double percent) {
      List<int> rows = new List<int>();

      for (int i = 0; i < preprocessingData.Rows; ++i) {
        int missingCount = statisticsLogic.GetRowMissingValueCount(i);
        // computed in double so the comparison is not distorted by float rounding
        if (100.0 * missingCount / preprocessingData.Columns > percent) {
          rows.Add(i);
        }
      }

      return rows;
    }

    /// <summary>
    /// Returns the indices of all columns whose share of missing values, in percent
    /// of the row count, is greater than <paramref name="percent"/>.
    /// </summary>
    public List<int> ColumnsWithMissingValuesGreater(double percent) {
      List<int> columns = new List<int>();

      for (int i = 0; i < preprocessingData.Columns; ++i) {
        int missingCount = statisticsLogic.GetMissingValueCount(i);
        // computed in double so the comparison is not distorted by float rounding
        if (100.0 * missingCount / preprocessingData.Rows > percent) {
          columns.Add(i);
        }
      }

      return columns;
    }

    /// <summary>
    /// Returns the indices of all double and DateTime columns whose variance is
    /// smaller than <paramref name="variance"/>.
    /// </summary>
    public List<int> ColumnsWithVarianceSmaller(double variance) {
      List<int> columns = new List<int>();

      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.IsType<double>(i) || preprocessingData.IsType<DateTime>(i)) {
          double columnVariance = statisticsLogic.GetVariance(i);
          if (columnVariance < variance) {
            columns.Add(i);
          }
        }
      }

      return columns;
    }

    /// <summary>
    /// Deletes all rows reported by <see cref="RowsWithMissingValuesGreater"/>.
    /// </summary>
    public void DeleteRowsWithMissingValuesGreater(double percent) {
      DeleteRows(RowsWithMissingValuesGreater(percent));
    }

    /// <summary>
    /// Deletes all columns reported by <see cref="ColumnsWithMissingValuesGreater"/>.
    /// </summary>
    public void DeleteColumnsWithMissingValuesGreater(double percent) {
      DeleteColumns(ColumnsWithMissingValuesGreater(percent));
    }

    /// <summary>
    /// Deletes all columns reported by <see cref="ColumnsWithVarianceSmaller"/>.
    /// </summary>
    public void DeleteColumnsWithVarianceSmaller(double variance) {
      DeleteColumns(ColumnsWithVarianceSmaller(variance));
    }

    /// <summary>
    /// Deletes the given rows in one transaction. The list is sorted in place.
    /// </summary>
    private void DeleteRows(List<int> rows) {
      // highest index first - presumably so that a deletion does not shift the
      // indices of the rows that are still to be deleted
      rows.Sort();
      rows.Reverse();

      preprocessingData.InTransaction(() => {
        foreach (int row in rows) {
          preprocessingData.DeleteRow(row);
        }
      });
    }

    /// <summary>
    /// Deletes the given columns in one transaction. The list is sorted in place.
    /// </summary>
    private void DeleteColumns(List<int> columns) {
      // highest index first - presumably so that a deletion does not shift the
      // indices of the columns that are still to be deleted
      columns.Sort();
      columns.Reverse();

      preprocessingData.InTransaction(() => {
        foreach (int column in columns) {
          preprocessingData.DeleteColumn(column);
        }
      });
    }

    /// <summary>
    /// Subscriptions are forwarded to the <c>Changed</c> event of the data grid logic.
    /// </summary>
    public event DataPreprocessingChangedEventHandler Changed {
      add { dataGridLogic.Changed += value; }
      remove { dataGridLogic.Changed -= value; }
    }
  }
}