#region License Information /* HeuristicLab * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections; using System.Collections.Generic; using System.Globalization; using System.Linq; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; using HeuristicLab.Problems.DataAnalysis; namespace HeuristicLab.DataPreprocessing { [Item("PreprocessingData", "Represents data used for preprocessing.")] [StorableClass] public class PreprocessingData : NamedItem, IPreprocessingData { [Storable] private List dataColumns; public IList DataColumns { get { return dataColumns; } } #region Constructor, Cloning & Persistence public PreprocessingData(IDataAnalysisProblemData problemData) : base() { Name = "Preprocessing Data"; dataColumns = new List(); Transformations = new List(); selection = new Dictionary>(); Import(problemData); RegisterEventHandler(); } protected PreprocessingData(PreprocessingData original, Cloner cloner) : base(original, cloner) { dataColumns = new List(original.dataColumns.Select(cloner.Clone)); TrainingPartition = cloner.Clone(original.TrainingPartition); TestPartition = cloner.Clone(original.TestPartition); Transformations = new List(original.Transformations.Select(cloner.Clone)); InputVariables = new List(original.InputVariables); TargetVariable = original.TargetVariable; RegisterEventHandler(); } public override IDeepCloneable Clone(Cloner cloner) { return new PreprocessingData(this, cloner); } [StorableConstructor] protected PreprocessingData(bool deserializing) : base(deserializing) { } [StorableHook(HookType.AfterDeserialization)] private void AfterDeserialization() { RegisterEventHandler(); } private void RegisterEventHandler() { Changed += (s, e) => { switch (e.Type) { case DataPreprocessingChangedEventType.DeleteRow: case DataPreprocessingChangedEventType.Any: case DataPreprocessingChangedEventType.Transformation: int maxRowIndex = Math.Max(0, Rows); TrainingPartition.Start = Math.Min(TrainingPartition.Start, maxRowIndex); TrainingPartition.End = Math.Min(TrainingPartition.End, maxRowIndex); TestPartition.Start = Math.Min(TestPartition.Start, maxRowIndex); TestPartition.End = Math.Min(TestPartition.End, maxRowIndex); break; } }; } #endregion #region Cells public bool IsCellEmpty(int columnIndex, int rowIndex) { return !dataColumns[columnIndex].IsValidValue(rowIndex); } public T GetCell(int columnIndex, int rowIndex) { return dataColumns[columnIndex].TypeSwitch( c => c[rowIndex], c => c[rowIndex], c => c[rowIndex]); } public void SetCell(int columnIndex, int rowIndex, T value) { SaveSnapshot(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); for (int i = Rows; i <= rowIndex; i++) InsertRow(i); for (int i = Columns; i <= columnIndex; i++) InsertColumn(i.ToString(), i); dataColumns[columnIndex].TypeSwitch(value, (c, v) => c[rowIndex] = v, (c, v) => c[rowIndex] = v, (c, v) => c[rowIndex] = v); if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.ChangeItem, columnIndex, rowIndex); } public string GetCellAsString(int columnIndex, int rowIndex) { return dataColumns[columnIndex].GetValue(rowIndex); } public IEnumerable GetValues(int columnIndex, bool considerSelection) { return dataColumns[columnIndex].TypeSwitch( c => c.GetValues(considerSelection ? selection[columnIndex] : null), c => c.GetValues(considerSelection ? selection[columnIndex] : null), c => c.GetValues(considerSelection ? selection[columnIndex] : null)); } public void SetValues(int columnIndex, IEnumerable values) { SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); if (VariableHasType(columnIndex)) { var name = dataColumns[columnIndex].Name; if (dataColumns[columnIndex].IsType()) { dataColumns[columnIndex] = new DoublePreprocessingDataColumn(name, (IEnumerable)values); } else if (dataColumns[columnIndex].IsType()) { dataColumns[columnIndex] = new StringPreprocessingDataColumn(name, (IEnumerable)values); } else if (dataColumns[columnIndex].IsType()) { dataColumns[columnIndex] = new DateTimePreprocessingDataColumn(name, (IEnumerable)values); } else { throw new ArgumentException("Unknown column type"); } } else { throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + dataColumns[columnIndex].GetType().Name + " but was " + typeof(T).Name); } if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); } public bool SetValue(string value, int columnIndex, int rowIndex) { var column = dataColumns[columnIndex]; bool successful = column.SetValue(value, rowIndex); if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); return successful; } public int Columns { get { return dataColumns.Count; } } public int Rows { get { return dataColumns.Any() ? dataColumns.Max(c => c.Length) : 0; } } #endregion #region Rows public void InsertRow(int rowIndex) { SaveSnapshot(DataPreprocessingChangedEventType.DeleteRow, -1, rowIndex); foreach (var column in dataColumns) { column.TypeSwitch( c => c.Values.Insert(rowIndex, double.NaN), c => c.Values.Insert(rowIndex, null), c => c.Values.Insert(rowIndex, DateTime.MinValue)); } if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { TrainingPartition.End++; if (TrainingPartition.End <= TestPartition.Start) { TestPartition.Start++; TestPartition.End++; } } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { TestPartition.End++; if (TestPartition.End <= TrainingPartition.Start) { TestPartition.Start++; TestPartition.End++; } } if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.AddRow, -1, rowIndex); } public void DeleteRow(int rowIndex) { DeleteRows(new[] { rowIndex }); } public void DeleteRows(IEnumerable rowIndices) { SaveSnapshot(DataPreprocessingChangedEventType.AddRow, -1, -1); foreach (int rowIndex in rowIndices.OrderByDescending(x => x)) { foreach (var column in dataColumns) { column.TypeSwitch( c => c.Values.RemoveAt(rowIndex), c => c.Values.RemoveAt(rowIndex), c => c.Values.RemoveAt(rowIndex)); } if (TrainingPartition.Start <= rowIndex && rowIndex <= TrainingPartition.End) { TrainingPartition.End--; if (TrainingPartition.End <= TestPartition.Start) { TestPartition.Start--; TestPartition.End--; } } else if (TestPartition.Start <= rowIndex && rowIndex <= TestPartition.End) { TestPartition.End--; if (TestPartition.End <= TrainingPartition.Start) { TestPartition.Start--; TestPartition.End--; } } } if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.DeleteRow, -1, -1); } public void InsertColumn(string variableName, int columnIndex) { SaveSnapshot(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); if (typeof(T) == typeof(double)) { dataColumns.Insert(columnIndex, new DoublePreprocessingDataColumn(variableName, Enumerable.Repeat(double.NaN, Rows))); } else if (typeof(T) == typeof(string)) { dataColumns.Insert(columnIndex, new StringPreprocessingDataColumn(variableName, Enumerable.Repeat(string.Empty, Rows))); } else if (typeof(T) == typeof(DateTime)) { dataColumns.Insert(columnIndex, new DateTimePreprocessingDataColumn(variableName, Enumerable.Repeat(DateTime.MinValue, Rows))); } else { throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); } if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); } public void DeleteColumn(int columnIndex) { SaveSnapshot(DataPreprocessingChangedEventType.AddColumn, columnIndex, -1); dataColumns.RemoveAt(columnIndex); if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.DeleteColumn, columnIndex, -1); } public void RenameColumn(int columnIndex, string name) { if (columnIndex < 0 || columnIndex > dataColumns.Count) throw new ArgumentOutOfRangeException("columnIndex"); SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, columnIndex, -1); dataColumns[columnIndex].Name = name; if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); } public void RenameColumns(IList names) { if (names == null) throw new ArgumentNullException("names"); if (names.Count != dataColumns.Count) throw new ArgumentException("number of names must match the number of columns.", "names"); SaveSnapshot(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); for (int i = 0; i < names.Count; i++) dataColumns[i].Name = names[i]; if (!IsInTransaction) OnChanged(DataPreprocessingChangedEventType.ChangeColumn, -1, -1); } public bool AreAllStringColumns(IEnumerable columnIndices) { return columnIndices.All(VariableHasType); } #endregion #region Variables public IEnumerable VariableNames { get { return dataColumns.Select(c => c.Name); } } public IEnumerable GetDoubleVariableNames() { return dataColumns.OfType().Select(c => c.Name); } public string GetVariableName(int columnIndex) { return dataColumns[columnIndex].Name; } public int GetColumnIndex(string variableName) { return dataColumns.FindIndex(c => c.Name == variableName); } public bool VariableHasType(int columnIndex) { return dataColumns[columnIndex].IsType(); } public Type GetVariableType(int columnIndex) { return dataColumns[columnIndex].GetValueType(); } public IList InputVariables { get; private set; } public string TargetVariable { get; private set; } // optional #endregion #region Partitions [Storable] public IntRange TrainingPartition { get; set; } [Storable] public IntRange TestPartition { get; set; } #endregion #region Transformations [Storable] public IList Transformations { get; protected set; } #endregion #region Validation public bool Validate(string value, out string errorMessage, int columnIndex) { if (columnIndex < 0 || columnIndex > VariableNames.Count()) { throw new ArgumentOutOfRangeException("column index is out of range"); } bool valid = false; errorMessage = string.Empty; if (VariableHasType(columnIndex)) { if (string.IsNullOrWhiteSpace(value)) { valid = true; } else { double val; valid = double.TryParse(value, out val); if (!valid) { errorMessage = "Invalid Value (Valid Value Format: \"" + FormatPatterns.GetDoubleFormatPattern() + "\")"; } } } else if (VariableHasType(columnIndex)) { valid = value != null; if (!valid) { errorMessage = "Invalid Value (string must not be null)"; } } else if (VariableHasType(columnIndex)) { DateTime date; valid = DateTime.TryParse(value, out date); if (!valid) { errorMessage = "Invalid Value (Valid Value Format: \"" + CultureInfo.CurrentCulture.DateTimeFormat + "\""; } } else { throw new ArgumentException("column " + columnIndex + " contains a non supported type."); } return valid; } #endregion #region Import & Export public void Import(IDataAnalysisProblemData problemData) { var dataset = problemData.Dataset; InputVariables = new List(problemData.AllowedInputVariables); TargetVariable = problemData is IRegressionProblemData ? ((IRegressionProblemData)problemData).TargetVariable : problemData is IClassificationProblemData ? ((IClassificationProblemData)problemData).TargetVariable : null; dataColumns.Clear(); foreach (var variableName in problemData.Dataset.VariableNames) { if (dataset.VariableHasType(variableName)) { dataColumns.Add(new DoublePreprocessingDataColumn(variableName, dataset.GetDoubleValues(variableName))); } else if (dataset.VariableHasType(variableName)) { dataColumns.Add(new StringPreprocessingDataColumn(variableName, dataset.GetStringValues(variableName))); } else if (dataset.VariableHasType(variableName)) { dataColumns.Add(new DateTimePreprocessingDataColumn(variableName, dataset.GetDateTimeValues(variableName))); } else { throw new ArgumentException("The datatype of column " + variableName + " must be of type double, string or DateTime"); } } TrainingPartition = new IntRange(problemData.TrainingPartition.Start, problemData.TrainingPartition.End); TestPartition = new IntRange(problemData.TestPartition.Start, problemData.TestPartition.End); } public Dataset ExportToDataset() { IList values = new List(); for (int i = 0; i < Columns; i++) { var doubleColumn = dataColumns[i] as DoublePreprocessingDataColumn; var stringColumn = dataColumns[i] as StringPreprocessingDataColumn; var dateTimeColumn = dataColumns[i] as DateTimePreprocessingDataColumn; if (doubleColumn != null) values.Add(new List(doubleColumn.GetValues())); else if (stringColumn != null) values.Add(new List(stringColumn.GetValues())); else if (dateTimeColumn != null) values.Add(new List(dateTimeColumn.GetValues())); else throw new InvalidOperationException("Column type not supported for export"); } return new Dataset(VariableNames, values); } #endregion #region Selection [Storable] protected IDictionary> selection; public IDictionary> Selection { get { return selection; } set { selection = value; OnSelectionChanged(); } } public void ClearSelection() { Selection = new Dictionary>(); } public event EventHandler SelectionChanged; protected void OnSelectionChanged() { var listeners = SelectionChanged; if (listeners != null) listeners(this, EventArgs.Empty); } #endregion #region Transactions // Snapshot/History are not storable/cloneable on purpose private class Snapshot { public List DataColumns { get; set; } public IntRange TrainingPartition { get; set; } public IntRange TestPartition { get; set; } public IList Transformations { get; set; } public DataPreprocessingChangedEventType ChangedType { get; set; } public int ChangedColumn { get; set; } public int ChangedRow { get; set; } } public event DataPreprocessingChangedEventHandler Changed; protected virtual void OnChanged(DataPreprocessingChangedEventType type, int column, int row) { var listeners = Changed; if (listeners != null) listeners(this, new DataPreprocessingChangedEventArgs(type, column, row)); } private const int MaxUndoDepth = 5; private readonly IList undoHistory = new List(); private readonly Stack eventStack = new Stack(); public bool IsInTransaction { get { return eventStack.Count > 0; } } private void SaveSnapshot(DataPreprocessingChangedEventType changedType, int column, int row) { if (IsInTransaction) return; var cloner = new Cloner(); var currentSnapshot = new Snapshot { DataColumns = new List(dataColumns.Select(cloner.Clone)), TrainingPartition = new IntRange(TrainingPartition.Start, TrainingPartition.End), TestPartition = new IntRange(TestPartition.Start, TestPartition.End), Transformations = new List(Transformations), ChangedType = changedType, ChangedColumn = column, ChangedRow = row }; if (undoHistory.Count >= MaxUndoDepth) undoHistory.RemoveAt(0); undoHistory.Add(currentSnapshot); } public bool IsUndoAvailable { get { return undoHistory.Count > 0; } } public void Undo() { if (IsUndoAvailable) { Snapshot previousSnapshot = undoHistory[undoHistory.Count - 1]; dataColumns = previousSnapshot.DataColumns; TrainingPartition = previousSnapshot.TrainingPartition; TestPartition = previousSnapshot.TestPartition; Transformations = previousSnapshot.Transformations; undoHistory.Remove(previousSnapshot); OnChanged(previousSnapshot.ChangedType, previousSnapshot.ChangedColumn, previousSnapshot.ChangedRow); } } public void InTransaction(Action action, DataPreprocessingChangedEventType type = DataPreprocessingChangedEventType.Any) { BeginTransaction(type); action(); EndTransaction(); } public void BeginTransaction(DataPreprocessingChangedEventType type) { SaveSnapshot(type, -1, -1); eventStack.Push(type); } public void EndTransaction() { if (eventStack.Count == 0) throw new InvalidOperationException("There is no open transaction that can be ended."); var @event = eventStack.Pop(); OnChanged(@event, -1, -1); } #endregion /* #region Statistics public T GetMin(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { try { return dataColumns[columnIndex].TypeSwitch( col => col.GetMin(considerSelection ? Selection[columnIndex] : null), col => col.GetMin(considerSelection ? Selection[columnIndex] : null), col => col.GetMin(considerSelection ? Selection[columnIndex] : null)); } catch (InvalidOperationException) { return emptyValue; } } public T GetMax(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? values.Max() : emptyValue; } public T GetMean(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { return if (typeof(T) == typeof(double)) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? Convert(values.Average()) : emptyValue; } if (typeof(T) == typeof(string)) { return Convert(string.Empty); } if (typeof(T) == typeof(DateTime)) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? Convert(AggregateAsDouble(values, Enumerable.Average)) : emptyValue; } throw new InvalidOperationException(typeof(T) + " not supported"); } public T GetMedian(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable { if (typeof(T) == typeof(double)) {// IEnumerable is faster var doubleValues = GetValuesWithoutMissingValues(columnIndex, considerSelection); return doubleValues.Any() ? Convert(doubleValues.Median()) : emptyValue; } var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? values.Quantile(0.5) : emptyValue; } public T GetMode(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IEquatable { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? values.GroupBy(x => x).OrderByDescending(g => g.Count()).Select(g => g.Key).First() : emptyValue; } public T GetStandardDeviation(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { if (typeof(T) == typeof(double)) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? Convert(values.StandardDeviation()) : emptyValue; } // For DateTime, std.dev / variance would have to be TimeSpan //if (typeof(T) == typeof(DateTime)) { // var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); // return values.Any() ? Convert(AggregateAsDouble(values, EnumerableStatisticExtensions.StandardDeviation)) : emptyValue; //} return default(T); } public T GetVariance(int columnIndex, bool considerSelection = false, T emptyValue = default(T)) { if (typeof(T) == typeof(double)) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? Convert(values.Variance()) : emptyValue; } // DateTime variance often overflows long, thus the corresponding DateTime is invalid //if (typeof(T) == typeof(DateTime)) { // var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); // return values.Any() ? Convert(AggregateAsDouble(values, EnumerableStatisticExtensions.Variance)) : emptyValue; //} return default(T); } public T GetQuantile(double alpha, int columnIndex, bool considerSelection = false, T emptyValue = default(T)) where T : IComparable { if (typeof(T) == typeof(double)) {// IEnumerable is faster var doubleValues = GetValuesWithoutMissingValues(columnIndex, considerSelection); return doubleValues.Any() ? Convert(doubleValues.Quantile(alpha)) : emptyValue; } var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.Any() ? values.Quantile(alpha) : emptyValue; } public int GetDistinctValues(int columnIndex, bool considerSelection = false) { var values = GetValuesWithoutMissingValues(columnIndex, considerSelection); return values.GroupBy(x => x).Count(); } private IEnumerable GetValuesWithoutMissingValues(int columnIndex, bool considerSelection) { return GetValues(columnIndex, considerSelection).Where(x => ColumnTypeSwitch(dataColumns[columnIndex], x, (c, v) => c.IsValidValue(v), (c, v) => c.IsValidValue(v), (c, v) => c.IsValidValue(v) )); } private static DateTime AggregateAsDouble(IEnumerable values, Func, double> func) { return new DateTime((long)(func(values.Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond)) * TimeSpan.TicksPerSecond)); } public int GetMissingValueCount() { int count = 0; for (int i = 0; i < Columns; ++i) { count += GetMissingValueCount(i); } return count; } public int GetMissingValueCount(int columnIndex) { int sum = 0; for (int i = 0; i < Rows; i++) { if (IsCellEmpty(columnIndex, i)) sum++; } return sum; } public int GetRowMissingValueCount(int rowIndex) { int sum = 0; for (int i = 0; i < Columns; i++) { if (IsCellEmpty(i, rowIndex)) sum++; } return sum; } #endregion */ } // Adapted from HeuristicLab.Common.EnumerableStatisticExtensions internal static class EnumerableExtensions { public static T Quantile(this IEnumerable values, double alpha) where T : IComparable { T[] valuesArr = values.ToArray(); int n = valuesArr.Length; if (n == 0) throw new InvalidOperationException("Enumeration contains no elements."); var pos = n * alpha; return Select((int)Math.Ceiling(pos) - 1, valuesArr); } private static T Select(int k, T[] arr) where T : IComparable { int i, ir, j, l, mid, n = arr.Length; T a; l = 0; ir = n - 1; for (;;) { if (ir <= l + 1) { // Active partition contains 1 or 2 elements. if (ir == l + 1 && arr[ir].CompareTo(arr[l]) < 0) { // Case of 2 elements. Swap(arr, l, ir); } return arr[k]; } else { mid = (l + ir) >> 1; // Choose median of left, center, and right elements Swap(arr, mid, l + 1); // as partitioning element a. Also if (arr[l].CompareTo(arr[ir]) > 0) { // rearrange so that arr[l] arr[ir] <= arr[l+1], Swap(arr, l, ir); // . arr[ir] >= arr[l+1] } if (arr[l + 1].CompareTo(arr[ir]) > 0) { Swap(arr, l + 1, ir); } if (arr[l].CompareTo(arr[l + 1]) > 0) { Swap(arr, l, l + 1); } i = l + 1; // Initialize pointers for partitioning. j = ir; a = arr[l + 1]; // Partitioning element. for (;;) { // Beginning of innermost loop. do i++; while (arr[i].CompareTo(a) < 0); // Scan up to find element > a. do j--; while (arr[j].CompareTo(a) > 0); // Scan down to find element < a. if (j < i) break; // Pointers crossed. Partitioning complete. Swap(arr, i, j); } // End of innermost loop. arr[l + 1] = arr[j]; // Insert partitioning element. arr[j] = a; if (j >= k) ir = j - 1; // Keep active the partition that contains the if (j <= k) l = i; // kth element. } } } private static void Swap(T[] arr, int i, int j) { T temp = arr[i]; arr[i] = arr[j]; arr[j] = temp; } } }