#region License Information
/* HeuristicLab
* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Data;
using HeuristicLab.Random;
namespace HeuristicLab.DataPreprocessing {
/// <summary>
/// Bulk cell manipulations (replace, interpolate, shuffle, delete rows/columns) applied to an
/// <see cref="ITransactionalPreprocessingData"/> instance. Operations that touch several cells
/// are wrapped in <c>preprocessingData.InTransaction</c>.
/// </summary>
/// <remarks>
/// In the methods taking a <c>cells</c> dictionary, the key is used as a column index and the
/// value as the list of row indices to modify in that column.
/// </remarks>
public class ManipulationLogic {
  private readonly ITransactionalPreprocessingData preprocessingData;
  private readonly StatisticsLogic statisticsLogic;
  private readonly SearchLogic searchLogic;

  /// <summary>Variable names as exposed by the underlying preprocessing data.</summary>
  public IEnumerable<string> VariableNames {
    get { return preprocessingData.VariableNames; }
  }

  /// <summary>The preprocessing data instance this logic operates on.</summary>
  public ITransactionalPreprocessingData PreProcessingData {
    get { return preprocessingData; }
  }

  /// <summary>Creates the manipulation logic on top of the given data and helper logics.</summary>
  /// <param name="_prepocessingData">Data that all manipulations read from and write to.</param>
  /// <param name="theSearchLogic">Used to decide whether a cell is a missing value.</param>
  /// <param name="theStatisticsLogic">Used to obtain per-column statistics (average, median, min, max, ...).</param>
  public ManipulationLogic(ITransactionalPreprocessingData _prepocessingData, SearchLogic theSearchLogic, StatisticsLogic theStatisticsLogic) {
    preprocessingData = _prepocessingData;
    searchLogic = theSearchLogic;
    statisticsLogic = theStatisticsLogic;
  }

  /// <summary>Writes <paramref name="value"/> into every given row of one column.</summary>
  /// <remarks>Not wrapped in a transaction by itself; the callers in this class open one.</remarks>
  /// <typeparam name="T">Cell type of the column.</typeparam>
  /// <param name="columnIndex">Column to modify.</param>
  /// <param name="rowIndices">Rows to overwrite.</param>
  /// <param name="value">Value stored in each addressed cell.</param>
  public void ReplaceIndicesByValue<T>(int columnIndex, IEnumerable<int> rowIndices, T value) {
    foreach (int index in rowIndices) {
      preprocessingData.SetCell<T>(columnIndex, index, value);
    }
  }

  /// <summary>
  /// Replaces the given cells with the column average. Only double and DateTime columns are
  /// handled; columns of any other type are left unchanged.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  /// <param name="considerSelection">Passed through to the statistics logic.</param>
  public void ReplaceIndicesByAverageValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        if (preprocessingData.VariableHasType<double>(column.Key)) {
          double average = statisticsLogic.GetAverage(column.Key, considerSelection);
          ReplaceIndicesByValue<double>(column.Key, column.Value, average);
        } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
          DateTime average = statisticsLogic.GetAverageDateTime(column.Key, considerSelection);
          ReplaceIndicesByValue<DateTime>(column.Key, column.Value, average);
        }
      }
    });
  }

  /// <summary>
  /// Replaces the given cells with the column median. Only double and DateTime columns are
  /// handled; columns of any other type are left unchanged.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  /// <param name="considerSelection">Passed through to the statistics logic.</param>
  public void ReplaceIndicesByMedianValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        if (preprocessingData.VariableHasType<double>(column.Key)) {
          double median = statisticsLogic.GetMedian(column.Key, considerSelection);
          ReplaceIndicesByValue<double>(column.Key, column.Value, median);
        } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
          DateTime median = statisticsLogic.GetMedianDateTime(column.Key, considerSelection);
          ReplaceIndicesByValue<DateTime>(column.Key, column.Value, median);
        }
      }
    });
  }

  /// <summary>
  /// Replaces each given cell with an independent random value drawn between the column's
  /// minimum and maximum. Only double and DateTime columns are handled.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  /// <param name="considerSelection">Passed through to the statistics logic.</param>
  public void ReplaceIndicesByRandomValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    preprocessingData.InTransaction(() => {
      System.Random r = new System.Random();
      foreach (var column in cells) {
        if (preprocessingData.VariableHasType<double>(column.Key)) {
          double max = statisticsLogic.GetMax<double>(column.Key, double.NaN, considerSelection);
          double min = statisticsLogic.GetMin<double>(column.Key, double.NaN, considerSelection);
          double randMultiplier = (max - min);
          foreach (int index in column.Value) {
            double rand = r.NextDouble() * randMultiplier + min;
            preprocessingData.SetCell<double>(column.Key, index, rand);
          }
        } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
          DateTime min = statisticsLogic.GetMin<DateTime>(column.Key, DateTime.MinValue, considerSelection);
          DateTime max = statisticsLogic.GetMax<DateTime>(column.Key, DateTime.MinValue, considerSelection);
          // The random offset is drawn in seconds and added to the minimum.
          double randMultiplier = (max - min).TotalSeconds;
          foreach (int index in column.Value) {
            double rand = r.NextDouble() * randMultiplier;
            preprocessingData.SetCell<DateTime>(column.Key, index, min.AddSeconds(rand));
          }
        }
      }
    });
  }

  /// <summary>
  /// Fills each run of consecutive given rows by linear interpolation between the nearest
  /// non-missing cell before the run and the nearest non-missing cell after it. Runs without
  /// a neighbour on either side are skipped.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  public void ReplaceIndicesByLinearInterpolationOfNeighbours(IDictionary<int, IList<int>> cells) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        IList<Tuple<int, int>> startEndings = GetStartAndEndingsForInterpolation(column);
        foreach (var tuple in startEndings) {
          Interpolate(column, tuple.Item1, tuple.Item2);
        }
      }
    });
  }

  /// <summary>
  /// Groups the column's row indices into runs of consecutive rows and returns, for each run,
  /// the pair (index of previous present value, index of next present value).
  /// </summary>
  /// <param name="column">Column index and the row indices to be interpolated.</param>
  /// <returns>Anchor index pairs; runs lacking an anchor on either side are omitted.</returns>
  private List<Tuple<int, int>> GetStartAndEndingsForInterpolation(KeyValuePair<int, IList<int>> column) {
    List<Tuple<int, int>> startEndings = new List<Tuple<int, int>>();
    // Sorted copy: run detection needs ascending order; the caller's list is not modified.
    IList<int> rowIndices = column.Value.OrderBy(x => x).ToList();
    int count = rowIndices.Count;
    // int.MinValue marks "no run open"; it cannot collide with -1 ("no previous value found").
    int start = int.MinValue;
    for (int i = 0; i < count; ++i) {
      if (start == int.MinValue) {
        start = indexOfPrevPresentValue(column.Key, rowIndices[i]);
      }
      // A run ends at the last index or where the next index is not adjacent.
      bool endOfRun = i + 1 == count || rowIndices[i + 1] - rowIndices[i] > 1;
      if (endOfRun) {
        int next = indexOfNextPresentValue(column.Key, rowIndices[i]);
        // start == 0 is a valid anchor (present value in the first row); only -1 means "none".
        if (start >= 0 && next < preprocessingData.Rows) {
          startEndings.Add(new Tuple<int, int>(start, next));
        }
        start = int.MinValue;
      }
    }
    return startEndings;
  }

  /// <summary>
  /// Replaces each given cell (and any cells between its neighbours) by linear interpolation
  /// between the nearest non-missing cell before and after it. Cells in the first or last row,
  /// or without a neighbour on either side, are left unchanged.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to smooth.</param>
  public void ReplaceIndicesBySmoothing(IDictionary<int, IList<int>> cells) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        int countValues = preprocessingData.Rows;
        foreach (int index in column.Value) {
          // Don't replace first or last values; they cannot have neighbours on both sides.
          if (index <= 0 || index >= countValues - 1) {
            continue;
          }
          int prevIndex = indexOfPrevPresentValue(column.Key, index);
          int nextIndex = indexOfNextPresentValue(column.Key, index);
          // no neighbours found
          if (prevIndex < 0 || nextIndex >= countValues) {
            continue;
          }
          Interpolate(column, prevIndex, nextIndex);
        }
      }
    });
  }

  /// <summary>
  /// Overwrites the cells strictly between <paramref name="prevIndex"/> and
  /// <paramref name="nextIndex"/> with values linearly interpolated between the two anchor
  /// cells. Handles double and DateTime columns; other column types are left unchanged.
  /// </summary>
  /// <param name="column">Only the key (column index) is used.</param>
  /// <param name="prevIndex">Row of the lower anchor value.</param>
  /// <param name="nextIndex">Row of the upper anchor value.</param>
  private void Interpolate(KeyValuePair<int, IList<int>> column, int prevIndex, int nextIndex) {
    int valuesToInterpolate = nextIndex - prevIndex;
    if (preprocessingData.VariableHasType<double>(column.Key)) {
      double prev = preprocessingData.GetCell<double>(column.Key, prevIndex);
      double next = preprocessingData.GetCell<double>(column.Key, nextIndex);
      double interpolationStep = (next - prev) / valuesToInterpolate;
      // Start after the anchor: rewriting it is pointless and would yield NaN for a non-finite step.
      for (int i = prevIndex + 1; i < nextIndex; ++i) {
        double interpolated = prev + (interpolationStep * (i - prevIndex));
        preprocessingData.SetCell<double>(column.Key, i, interpolated);
      }
    } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
      DateTime prev = preprocessingData.GetCell<DateTime>(column.Key, prevIndex);
      DateTime next = preprocessingData.GetCell<DateTime>(column.Key, nextIndex);
      double interpolationStep = (next - prev).TotalSeconds / valuesToInterpolate;
      for (int i = prevIndex + 1; i < nextIndex; ++i) {
        DateTime interpolated = prev.AddSeconds(interpolationStep * (i - prevIndex));
        preprocessingData.SetCell<DateTime>(column.Key, i, interpolated);
      }
    }
  }

  /// <summary>
  /// Index of the closest row before <paramref name="start"/> that is not a missing value,
  /// or -1 if there is none.
  /// </summary>
  private int indexOfPrevPresentValue(int columnIndex, int start) {
    int offset = start - 1;
    while (offset >= 0 && searchLogic.IsMissingValue(columnIndex, offset)) {
      offset--;
    }
    return offset;
  }

  /// <summary>
  /// Index of the closest row after <paramref name="start"/> that is not a missing value,
  /// or <c>preprocessingData.Rows</c> if there is none.
  /// </summary>
  private int indexOfNextPresentValue(int columnIndex, int start) {
    int offset = start + 1;
    while (offset < preprocessingData.Rows && searchLogic.IsMissingValue(columnIndex, offset)) {
      offset++;
    }
    return offset;
  }

  /// <summary>Replaces the given cells with the most common value of their column.</summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  /// <param name="considerSelection">Passed through to the statistics logic.</param>
  /// <exception cref="ArgumentException">A column is neither double, string nor DateTime.</exception>
  public void ReplaceIndicesByMostCommonValue(IDictionary<int, IList<int>> cells, bool considerSelection = false) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        if (preprocessingData.VariableHasType<double>(column.Key)) {
          ReplaceIndicesByValue<double>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<double>(column.Key, double.NaN, considerSelection));
        } else if (preprocessingData.VariableHasType<string>(column.Key)) {
          ReplaceIndicesByValue<string>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<string>(column.Key, string.Empty, considerSelection));
        } else if (preprocessingData.VariableHasType<DateTime>(column.Key)) {
          ReplaceIndicesByValue<DateTime>(column.Key, column.Value, statisticsLogic.GetMostCommonValue<DateTime>(column.Key, DateTime.MinValue, considerSelection));
        } else {
          throw new ArgumentException("column with index: " + column.Key + " contains a non supported type.");
        }
      }
    });
  }

  /// <summary>Randomly reorders the rows of the data.</summary>
  /// <param name="shuffleRangesSeparately">
  /// When true, rows are only permuted within the test partition and within the training
  /// partition; rows outside the range being processed keep their position. When false,
  /// all rows are permuted together.
  /// </param>
  public void Shuffle(bool shuffleRangesSeparately) {
    var random = new FastRandom();
    if (shuffleRangesSeparately) {
      var ranges = new[] { preprocessingData.TestPartition, preprocessingData.TrainingPartition };
      preprocessingData.InTransaction(() => {
        // process all given ranges - e.g. TrainingPartition, TestPartition
        foreach (IntRange range in ranges) {
          // Identity permutation everywhere except inside the current range.
          var indices = Enumerable.Range(0, preprocessingData.Rows).ToArray();
          var shuffledIndices = Enumerable.Range(range.Start, range.Size).Shuffle(random).ToArray();
          for (int i = range.Start, j = 0; i < range.End; i++, j++)
            indices[i] = shuffledIndices[j];
          ReOrderToIndices(indices);
        }
      });
    } else {
      preprocessingData.InTransaction(() => {
        var indices = Enumerable.Range(0, preprocessingData.Rows);
        var shuffledIndices = indices.Shuffle(random).ToArray();
        ReOrderToIndices(shuffledIndices);
      });
    }
  }

  /// <summary>
  /// Reorders all double, string and DateTime columns so that row <c>i</c> afterwards holds
  /// the value previously stored in row <c>indices[i]</c>. Columns of other types are skipped.
  /// </summary>
  /// <param name="indices">Source row for each target row; must have one entry per row.</param>
  /// <exception cref="InvalidOperationException">The number of indices differs from a column's value count.</exception>
  public void ReOrderToIndices(int[] indices) {
    preprocessingData.InTransaction(() => {
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i)) {
          ReOrderToIndices<double>(i, indices);
        } else if (preprocessingData.VariableHasType<string>(i)) {
          ReOrderToIndices<string>(i, indices);
        } else if (preprocessingData.VariableHasType<DateTime>(i)) {
          ReOrderToIndices<DateTime>(i, indices);
        }
      }
    });
  }

  /// <summary>Reorders a single column according to <paramref name="indices"/>.</summary>
  /// <typeparam name="T">Cell type of the column.</typeparam>
  /// <param name="columnIndex">Column to reorder.</param>
  /// <param name="indices">Source row for each target row.</param>
  /// <exception cref="InvalidOperationException">The number of indices differs from the column's value count.</exception>
  private void ReOrderToIndices<T>(int columnIndex, int[] indices) {
    // Snapshot the column first: the writes below would otherwise corrupt later reads.
    List<T> originalData = new List<T>(preprocessingData.GetValues<T>(columnIndex));
    if (indices.Length != originalData.Count) throw new InvalidOperationException("The number of provided indices does not match the values.");
    for (int i = 0; i < indices.Length; i++) {
      T replaceValue = originalData[indices[i]];
      preprocessingData.SetCell<T>(columnIndex, i, replaceValue);
    }
  }

  /// <summary>
  /// Sets every given cell from the string <paramref name="value"/> via
  /// <c>preprocessingData.SetValue</c>.
  /// </summary>
  /// <param name="cells">Column index mapped to the row indices to replace.</param>
  /// <param name="value">Textual value handed to <c>SetValue</c> for each cell.</param>
  public void ReplaceIndicesByValue(IDictionary<int, IList<int>> cells, string value) {
    preprocessingData.InTransaction(() => {
      foreach (var column in cells) {
        foreach (var rowIdx in column.Value) {
          preprocessingData.SetValue(value, column.Key, rowIdx);
        }
      }
    });
  }

  /// <summary>Finds rows whose share of missing values exceeds a threshold.</summary>
  /// <param name="percent">Threshold in percent (0-100) of the column count.</param>
  /// <returns>Indices of the matching rows, in ascending order.</returns>
  public List<int> RowsWithMissingValuesGreater(double percent) {
    List<int> rows = new List<int>();
    for (int i = 0; i < preprocessingData.Rows; ++i) {
      int missingCount = statisticsLogic.GetRowMissingValueCount(i);
      // Computed in double (multiply first) so the comparison with the double threshold is not skewed by float rounding.
      if (100.0 * missingCount / preprocessingData.Columns > percent) {
        rows.Add(i);
      }
    }
    return rows;
  }

  /// <summary>Finds columns whose share of missing values exceeds a threshold.</summary>
  /// <param name="percent">Threshold in percent (0-100) of the row count.</param>
  /// <returns>Indices of the matching columns, in ascending order.</returns>
  public List<int> ColumnsWithMissingValuesGreater(double percent) {
    List<int> columns = new List<int>();
    for (int i = 0; i < preprocessingData.Columns; ++i) {
      int missingCount = statisticsLogic.GetMissingValueCount(i);
      // Computed in double (multiply first) so the comparison with the double threshold is not skewed by float rounding.
      if (100.0 * missingCount / preprocessingData.Rows > percent) {
        columns.Add(i);
      }
    }
    return columns;
  }

  /// <summary>Finds double or DateTime columns whose variance is below a threshold.</summary>
  /// <param name="variance">Exclusive upper bound for the column variance.</param>
  /// <returns>Indices of the matching columns, in ascending order.</returns>
  public List<int> ColumnsWithVarianceSmaller(double variance) {
    List<int> columns = new List<int>();
    for (int i = 0; i < preprocessingData.Columns; ++i) {
      if (preprocessingData.VariableHasType<double>(i) || preprocessingData.VariableHasType<DateTime>(i)) {
        double columnVariance = statisticsLogic.GetVariance(i);
        if (columnVariance < variance) {
          columns.Add(i);
        }
      }
    }
    return columns;
  }

  /// <summary>Deletes all rows returned by <see cref="RowsWithMissingValuesGreater"/>.</summary>
  /// <param name="percent">Threshold in percent of the column count.</param>
  public void DeleteRowsWithMissingValuesGreater(double percent) {
    DeleteRows(RowsWithMissingValuesGreater(percent));
  }

  /// <summary>Deletes all columns returned by <see cref="ColumnsWithMissingValuesGreater"/>.</summary>
  /// <param name="percent">Threshold in percent of the row count.</param>
  public void DeleteColumnsWithMissingValuesGreater(double percent) {
    DeleteColumns(ColumnsWithMissingValuesGreater(percent));
  }

  /// <summary>Deletes all columns returned by <see cref="ColumnsWithVarianceSmaller"/>.</summary>
  /// <param name="variance">Exclusive upper bound for the column variance.</param>
  public void DeleteColumnsWithVarianceSmaller(double variance) {
    DeleteColumns(ColumnsWithVarianceSmaller(variance));
  }

  /// <summary>Deletes the given rows in one transaction.</summary>
  /// <param name="rows">Row indices to delete; the list itself is not modified.</param>
  private void DeleteRows(List<int> rows) {
    preprocessingData.InTransaction(() => {
      // Highest index first so the indices still to be deleted do not shift.
      foreach (int row in rows.OrderByDescending(x => x)) {
        preprocessingData.DeleteRow(row);
      }
    });
  }

  /// <summary>Deletes the given columns in one transaction.</summary>
  /// <param name="columns">Column indices to delete; the list itself is not modified.</param>
  private void DeleteColumns(List<int> columns) {
    preprocessingData.InTransaction(() => {
      // Highest index first so the indices still to be deleted do not shift.
      foreach (int column in columns.OrderByDescending(x => x)) {
        preprocessingData.DeleteColumn(column);
      }
    });
  }
}
}