#region License Information
/* HeuristicLab
* Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Problems.DataAnalysis;
namespace HeuristicLab.DataPreprocessing {
[Item("PreprocessingData", "Represents data used for preprocessing.")]
public class PreprocessingData : NamedItem, IPreprocessingData {
  // Column values keyed by zero-based column index. Keys are kept contiguous
  // (0 .. Columns - 1) so that they always line up with the positions in variableNames.
  // Every column is stored as a List<T> (see IsType<T> and InsertRow, which rely on that).
  protected IDictionary<int, IList> variableValues;

  // Variable (column) names; the position in this list is the column index.
  protected IList<string> variableNames;

  // Fraction of the rows that belongs to the training partition.
  protected double trainingToTestRatio;

  /// <summary>
  /// Cloning constructor: copies the column lists, the variable names and the partition ratio
  /// so that the clone does not share mutable collections with <paramref name="original"/>.
  /// </summary>
  protected PreprocessingData(PreprocessingData original, Cloner cloner)
    : base(original, cloner) {
    variableValues = CopyVariableValues(original.variableValues);
    variableNames = new List<string>(original.variableNames);
    trainingToTestRatio = original.trainingToTestRatio;
  }

  /// <summary>
  /// Builds the preprocessing data from the dataset of <paramref name="problemData"/>.
  /// </summary>
  /// <exception cref="ArgumentException">
  /// Thrown when a dataset column is neither a double, string nor DateTime column.
  /// </exception>
  public PreprocessingData(IDataAnalysisProblemData problemData)
    : base() {
    Name = "-";

    variableNames = new List<string>(problemData.Dataset.VariableNames);

    // create dictionary from column index to the values of that column
    int columnIndex = 0;
    variableValues = new Dictionary<int, IList>();
    foreach (var variableName in problemData.Dataset.VariableNames) {
      if (problemData.Dataset.IsType<double>(variableName)) {
        variableValues[columnIndex] = problemData.Dataset.GetDoubleValues(variableName).ToList();
      } else if (problemData.Dataset.IsType<string>(variableName)) {
        variableValues[columnIndex] = CreateColumn<string>(problemData.Dataset, columnIndex, x => x);
      } else if (problemData.Dataset.IsType<DateTime>(variableName)) {
        variableValues[columnIndex] = CreateColumn<DateTime>(problemData.Dataset, columnIndex, x => DateTime.Parse(x));
      } else {
        throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
      }
      ++columnIndex;
    }

    // Math.Max with double.Epsilon avoids a division by zero for an empty dataset.
    trainingToTestRatio = (double)problemData.TrainingPartition.Size / Math.Max(problemData.Dataset.Rows, double.Epsilon);
  }

  /// <summary>
  /// Reads one dataset column cell by cell and converts every cell with <paramref name="selector"/>.
  /// </summary>
  private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
    var list = new List<T>(ds.Rows);
    for (int row = 0; row < ds.Rows; ++row) {
      list.Add(selector(ds.GetValue(row, column)));
    }
    return list;
  }

  /// <summary>
  /// Creates a new dictionary that contains a copy of every column list of
  /// <paramref name="original"/> under the same key.
  /// </summary>
  /// <remarks>
  /// Only <paramref name="original"/> is read here. The field <c>variableValues</c> must not be
  /// touched because this method is called from the cloning constructor before that field is assigned.
  /// </remarks>
  protected IDictionary<int, IList> CopyVariableValues(IDictionary<int, IList> original) {
    var copy = new Dictionary<int, IList>(original.Count);
    foreach (var column in original) {
      // Instantiates the same list type via its copy constructor, e.g. new List<double>(values).
      copy[column.Key] = (IList)Activator.CreateInstance(column.Value.GetType(), column.Value);
    }
    return copy;
  }

  #region NamedItem abstract Member Implementations

  public override IDeepCloneable Clone(Cloner cloner) {
    return new PreprocessingData(this, cloner);
  }

  #endregion

  #region IPreprocessingData Members

  /// <summary>Returns the cell at the given position cast to <typeparamref name="T"/>.</summary>
  public T GetCell<T>(int columnIndex, int rowIndex) {
    return (T)variableValues[columnIndex][rowIndex];
  }

  /// <summary>Overwrites the cell at the given position.</summary>
  public virtual void SetCell<T>(int columnIndex, int rowIndex, T value) {
    variableValues[columnIndex][rowIndex] = value;
  }

  /// <summary>
  /// Returns the string representation of a cell; an empty string for a null cell.
  /// </summary>
  public string GetCellAsString(int columnIndex, int rowIndex) {
    // InsertRow stores null in reference-type columns, so a cell can legitimately be null.
    object cell = variableValues[columnIndex][rowIndex];
    return cell == null ? string.Empty : cell.ToString();
  }

  [Obsolete("use the index based variant, is faster")]
  public IList<T> GetValues<T>(string variableName) {
    return GetValues<T>(GetColumnIndex(variableName));
  }

  /// <summary>Returns the stored list of the column (not a copy).</summary>
  public IList<T> GetValues<T>(int columnIndex) {
    return (IList<T>)variableValues[columnIndex];
  }

  /// <summary>
  /// Replaces the values of a column. The column must already hold values of type <typeparamref name="T"/>.
  /// </summary>
  /// <exception cref="ArgumentNullException">Thrown when <paramref name="values"/> is null.</exception>
  /// <exception cref="ArgumentException">Thrown when the column is not a List of <typeparamref name="T"/>.</exception>
  public virtual void SetValues<T>(int columnIndex, IList<T> values) {
    if (values == null) throw new ArgumentNullException("values");
    if (IsType<T>(columnIndex)) {
      // A List<T> is stored by reference as before. Any other IList<T> (e.g. an array) is copied
      // into a List<T> so that IsType<T>, InsertRow and DeleteRow keep working on the column.
      variableValues[columnIndex] = values as List<T> ?? new List<T>(values);
    } else {
      throw new ArgumentException("The datatype of column " + columnIndex + " must be of type " + variableValues[columnIndex].GetType().Name + " but was " + typeof(T).Name);
    }
  }

  /// <summary>
  /// Inserts a row at <paramref name="rowIndex"/> into every column, filled with the default
  /// value of the column's element type (null for reference types).
  /// </summary>
  public virtual void InsertRow(int rowIndex) {
    foreach (IList column in variableValues.Values) {
      Type type = column.GetType().GetGenericArguments()[0];
      column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
    }
  }

  /// <summary>Removes the row at <paramref name="rowIndex"/> from every column.</summary>
  public virtual void DeleteRow(int rowIndex) {
    foreach (IList column in variableValues.Values) {
      column.RemoveAt(rowIndex);
    }
  }

  /// <summary>
  /// Inserts a new column of type <typeparamref name="T"/> at <paramref name="columnIndex"/>.
  /// Columns at or after that index move one position up; the new column gets one
  /// default-valued cell per existing row.
  /// </summary>
  /// <exception cref="ArgumentOutOfRangeException">
  /// Thrown when <paramref name="columnIndex"/> is not in the range [0, Columns].
  /// </exception>
  public virtual void InsertColumn<T>(string variableName, int columnIndex) {
    if (columnIndex < 0 || columnIndex > variableNames.Count) throw new ArgumentOutOfRangeException("columnIndex");

    // Rows reads column 0, so it has to be evaluated before the keys are shifted.
    int rows = Rows;

    // Shift from the highest key downwards so that no entry is overwritten.
    var keysToShift = variableValues.Keys.Where(k => k >= columnIndex).OrderByDescending(k => k).ToList();
    foreach (int key in keysToShift) {
      variableValues[key + 1] = variableValues[key];
      variableValues.Remove(key);
    }

    variableValues.Add(columnIndex, new List<T>(Enumerable.Repeat(default(T), rows)));
    variableNames.Insert(columnIndex, variableName);
  }

  /// <summary>
  /// Deletes the column at <paramref name="columnIndex"/>. Columns after that index move one
  /// position down so that value keys and name positions stay aligned.
  /// </summary>
  /// <exception cref="ArgumentOutOfRangeException">
  /// Thrown when <paramref name="columnIndex"/> is not in the range [0, Columns).
  /// </exception>
  public virtual void DeleteColumn(int columnIndex) {
    if (columnIndex < 0 || columnIndex >= variableNames.Count) throw new ArgumentOutOfRangeException("columnIndex");

    variableValues.Remove(columnIndex);

    // Shift from the lowest key upwards so that no entry is overwritten.
    var keysToShift = variableValues.Keys.Where(k => k > columnIndex).OrderBy(k => k).ToList();
    foreach (int key in keysToShift) {
      variableValues[key - 1] = variableValues[key];
      variableValues.Remove(key);
    }

    variableNames.RemoveAt(columnIndex);
  }

  /// <summary>Rows [0, Rows * trainingToTestRatio) form the training partition.</summary>
  public IntRange TrainingPartition {
    get { return new IntRange(0, (int)(Rows * trainingToTestRatio)); }
  }

  /// <summary>Rows [Rows * trainingToTestRatio, Rows) form the test partition.</summary>
  public IntRange TestPartition {
    get { return new IntRange((int)(Rows * trainingToTestRatio), Rows); }
  }

  public string GetVariableName(int columnIndex) {
    return variableNames[columnIndex];
  }

  public IEnumerable<string> VariableNames {
    get { return variableNames; }
  }

  /// <summary>Returns the index of the variable or -1 if there is no such variable.</summary>
  public int GetColumnIndex(string variableName) {
    return variableNames.IndexOf(variableName);
  }

  /// <summary>Returns true if the column is stored as a List of <typeparamref name="T"/>.</summary>
  public bool IsType<T>(int columnIndex) {
    return variableValues[columnIndex] is List<T>;
  }

  public int Columns {
    get { return variableNames.Count; }
  }

  /// <summary>Number of rows, taken from the first column; 0 if there are no columns.</summary>
  public int Rows {
    get { return variableValues.Count > 0 ? variableValues[0].Count : 0; }
  }

  /// <summary>
  /// Creates a Dataset from the variable names and the column lists in column order.
  /// The column lists are passed on as they are (no copy is made here).
  /// </summary>
  public Dataset ExportToDataset() {
    IList<IList> values = new List<IList>();
    for (int i = 0; i < Columns; ++i) {
      values.Add(variableValues[i]);
    }

    var dataset = new Dataset(variableNames, values);
    return dataset;
  }

  #endregion
}
}