#region License Information
/* HeuristicLab
* Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Problems.DataAnalysis;
namespace HeuristicLab.DataPreprocessing {
/// <summary>
/// Mutable, column-oriented copy of the dataset of a data analysis problem that can be
/// edited cell-, row- and column-wise and exported back to a <see cref="Dataset"/>.
/// Supported column types are <c>double</c>, <c>string</c> and <c>DateTime</c>.
/// </summary>
[Item("PreprocessingData", "Represents data used for preprocessing.")]
public class PreprocessingData : NamedItem, IPreprocessingData {
  // Column values keyed by variable name; this class always stores columns as List<T> instances.
  private IDictionary<string, IList> variableValues;
  // Variable names in column order.
  private IList<string> variableNames;
  // Position of every variable name inside variableNames.
  private IDictionary<string, int> variableNameIndices;
  // Fraction of all rows that belongs to the training partition.
  private double trainingToTestRatio;

  /// <summary>
  /// Cloning constructor. Copies the state of <paramref name="original"/>; the column lists
  /// are duplicated so that edits on the clone do not leak into the original.
  /// </summary>
  private PreprocessingData(PreprocessingData original, Cloner cloner)
    : base(original, cloner) {
    // Read from 'original': the fields of the new instance are still null at this point.
    variableNames = new List<string>(original.variableNames);
    variableNameIndices = new Dictionary<string, int>(original.variableNameIndices);
    variableValues = new Dictionary<string, IList>();
    foreach (var entry in original.variableValues) {
      variableValues.Add(entry.Key, CopyColumn(entry.Value));
    }
    trainingToTestRatio = original.trainingToTestRatio;
  }

  /// <summary>
  /// Creates preprocessing data from the dataset of <paramref name="problemData"/>.
  /// </summary>
  /// <param name="problemData">Problem data whose dataset and training partition are read.</param>
  /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is null.</exception>
  /// <exception cref="ArgumentException">A column is neither double, string nor DateTime.</exception>
  public PreprocessingData(IDataAnalysisProblemData problemData)
    : base() {
    if (problemData == null) {
      throw new ArgumentNullException("problemData");
    }
    Name = "-";

    var dataset = problemData.Dataset;
    variableNames = new List<string>(dataset.VariableNames);

    // create dictionary from variable name to index
    variableNameIndices = new Dictionary<string, int>();
    RebuildVariableNameIndices();

    // copy values
    variableValues = new Dictionary<string, IList>();
    foreach (var variableName in variableNames) {
      if (dataset.IsType<double>(variableName)) {
        variableValues[variableName] = dataset.GetDoubleValues(variableName).ToList();
      } else if (dataset.IsType<string>(variableName)) {
        variableValues[variableName] = CreateColumn<string>(dataset, variableNameIndices[variableName], x => x);
      } else if (dataset.IsType<DateTime>(variableName)) {
        variableValues[variableName] = CreateColumn<DateTime>(dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
      } else {
        throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
      }
    }

    // double.Epsilon keeps the division defined for a dataset without rows.
    trainingToTestRatio = (double)problemData.TrainingPartition.Size / Math.Max(dataset.Rows, double.Epsilon);
  }

  /// <summary>
  /// Reads one dataset column cell by cell and converts every value with <paramref name="selector"/>.
  /// </summary>
  private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
    // The constructor argument only reserves capacity; the list is still empty,
    // so the values have to be appended instead of assigned by index.
    var list = new List<T>(ds.Rows);
    for (int row = 0; row < ds.Rows; row++) {
      list.Add(selector(ds.GetValue(row, column)));
    }
    return list;
  }

  /// <summary>
  /// Creates a new list of the same runtime type as <paramref name="column"/> with the same elements.
  /// Relies on the column being a List&lt;T&gt;, which offers a constructor taking an IEnumerable&lt;T&gt;.
  /// </summary>
  private static IList CopyColumn(IList column) {
    return (IList)Activator.CreateInstance(column.GetType(), new object[] { column });
  }

  /// <summary>
  /// Recomputes the name-to-position map from the current column order.
  /// </summary>
  private void RebuildVariableNameIndices() {
    variableNameIndices.Clear();
    for (int i = 0; i < variableNames.Count; i++) {
      variableNameIndices.Add(variableNames[i], i);
    }
  }

  #region NamedItem abstract Member Implementations

  public override IDeepCloneable Clone(Cloner cloner) {
    return new PreprocessingData(this, cloner);
  }

  #endregion

  #region IPreprocessingData Members

  /// <summary>Returns the cell of column <paramref name="variableName"/> at <paramref name="row"/>, cast to T.</summary>
  public T GetCell<T>(string variableName, int row) {
    return (T)variableValues[variableName][row];
  }

  /// <summary>Overwrites the cell of column <paramref name="variableName"/> at <paramref name="row"/>.</summary>
  public void SetCell<T>(string variableName, int row, T value) {
    variableValues[variableName][row] = value;
  }

  /// <summary>
  /// Returns the textual representation of a cell; a null cell yields the empty string.
  /// </summary>
  public string GetCellAsString(string variableName, int row) {
    // InsertRow fills reference-type columns with null, so null cells are expected here.
    object cell = variableValues[variableName][row];
    return cell == null ? string.Empty : cell.ToString();
  }

  /// <summary>
  /// Returns the stored column itself (no copy) as a sequence of T.
  /// Throws an InvalidCastException when the column cannot be cast to IEnumerable&lt;T&gt;.
  /// </summary>
  public IEnumerable<T> GetValues<T>(string variableName) {
    // TODO: test if cast is valid
    return (IEnumerable<T>)variableValues[variableName];
  }

  /// <summary>Replaces the column <paramref name="variableName"/> by a list copy of <paramref name="values"/>.</summary>
  public void SetValues<T>(string variableName, IEnumerable<T> values) {
    variableValues[variableName] = values.ToList();
  }

  /// <summary>
  /// Inserts a row at <paramref name="rowIndex"/>; every column receives the default value
  /// of its element type (null for reference types).
  /// </summary>
  public void InsertRow(int rowIndex) {
    foreach (IList column in variableValues.Values) {
      Type type = column.GetType().GetGenericArguments()[0];
      column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
    }
  }

  /// <summary>Removes the row at <paramref name="rowIndex"/> from every column.</summary>
  public void DeleteRow(int rowIndex) {
    foreach (IList column in variableValues.Values) {
      column.RemoveAt(rowIndex);
    }
  }

  /// <summary>
  /// Inserts a new, untyped column at <paramref name="columnIndex"/>.
  /// </summary>
  public void InsertColumn(string variableName, int columnIndex) {
    // NOTE(review): the element type of this overload is assumed to be object - confirm.
    // Such a column is not recognized by IsType<double/string/DateTime>; use the generic overload for typed columns.
    InsertColumn<object>(variableName, columnIndex);
  }

  /// <summary>
  /// Inserts a new column with element type T at <paramref name="columnIndex"/>. The column
  /// gets one default(T) entry per existing row so that all columns keep the same length.
  /// </summary>
  /// <exception cref="ArgumentException">A column with that name already exists.</exception>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is not in [0, Columns].</exception>
  public void InsertColumn<T>(string variableName, int columnIndex) {
    // Validate everything up front so that a failure cannot leave the three collections out of sync.
    if (variableValues.ContainsKey(variableName)) {
      throw new ArgumentException("A column with the name " + variableName + " already exists.", "variableName");
    }
    if (columnIndex < 0 || columnIndex > variableNames.Count) {
      throw new ArgumentOutOfRangeException("columnIndex");
    }

    int rows = Rows;
    var column = new List<T>(rows);
    for (int row = 0; row < rows; row++) {
      column.Add(default(T));
    }

    variableValues.Add(variableName, column);
    variableNames.Insert(columnIndex, variableName);
    // Columns behind the insert position have moved by one.
    RebuildVariableNameIndices();
  }

  /// <summary>
  /// Removes the column <paramref name="variableName"/>.
  /// Throws a KeyNotFoundException for an unknown name, before anything is modified.
  /// </summary>
  public void DeleteColumn(string variableName) {
    int columnIndex = variableNameIndices[variableName];
    variableValues.Remove(variableName);
    variableNames.RemoveAt(columnIndex);
    // Columns behind the removed one have moved by one.
    RebuildVariableNameIndices();
  }

  /// <summary>Rows [0, Rows * ratio) where ratio was taken from the problem data at construction.</summary>
  public IntRange TrainingPartition {
    get { return new IntRange(0, (int)(Rows * trainingToTestRatio)); }
  }

  /// <summary>Rows [Rows * ratio, Rows), i.e. everything behind the training partition.</summary>
  public IntRange TestPartition {
    get { return new IntRange((int)(Rows * trainingToTestRatio), Rows); }
  }

  /// <summary>Variable names in column order (the internal list, not a copy).</summary>
  public IList<string> VariableNames {
    get { return variableNames; }
  }

  /// <summary>True when the column <paramref name="variableName"/> is stored as a List&lt;T&gt;.</summary>
  public bool IsType<T>(string variableName) {
    return variableValues[variableName] is List<T>;
  }

  /// <summary>Number of columns.</summary>
  public int Columns {
    get { return variableNames.Count; }
  }

  /// <summary>Number of rows, taken from the first column; 0 when there is no column.</summary>
  public int Rows {
    get { return variableNames.Count == 0 ? 0 : variableValues[variableNames[0]].Count; }
  }

  /// <summary>Returns the row indices of missing values for every column, keyed by variable name.</summary>
  public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
    var dic = new Dictionary<string, IEnumerable<int>>();
    foreach (string variableName in VariableNames) {
      dic.Add(variableName, GetMissingValueIndices(variableName));
    }
    return dic;
  }

  /// <summary>
  /// Tells whether a cell is missing: NaN for double, null or empty for string,
  /// DateTime.MinValue for DateTime columns.
  /// </summary>
  /// <exception cref="ArgumentException">The column has none of the supported types.</exception>
  public bool IsMissingValue(string variableName, int rowIndex) {
    if (IsType<double>(variableName)) {
      return double.IsNaN(GetCell<double>(variableName, rowIndex));
    } else if (IsType<string>(variableName)) {
      return string.IsNullOrEmpty(GetCell<string>(variableName, rowIndex));
    } else if (IsType<DateTime>(variableName)) {
      return GetCell<DateTime>(variableName, rowIndex).Equals(DateTime.MinValue);
    } else {
      throw new ArgumentException("cell in column with variableName: " + variableName + " and row index " + rowIndex + " contains a non supported type.");
    }
  }

  /// <summary>
  /// Returns the row indices of the missing values of one column, using the same rules as
  /// <see cref="IsMissingValue"/>. The result is a lazy query over the stored column.
  /// </summary>
  /// <exception cref="ArgumentException">The column has none of the supported types.</exception>
  public IEnumerable<int> GetMissingValueIndices(string variableName) {
    if (IsType<double>(variableName)) {
      return GetValues<double>(variableName).Select((s, i) => new { i, s }).Where(t => double.IsNaN(t.s)).Select(t => t.i);
    } else if (IsType<string>(variableName)) {
      return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
    } else if (IsType<DateTime>(variableName)) {
      return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
    } else {
      throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
    }
  }

  #endregion

  #region IPreprocessingData Members

  /// <summary>Builds a <see cref="Dataset"/> from the current variable names and columns, in column order.</summary>
  public Dataset ExportToDataset() {
    IList<IList> values = new List<IList>();
    foreach (var variable in VariableNames) {
      values.Add(variableValues[variable]);
    }

    var dataset = new Dataset(variableNames, values);
    return dataset;
  }

  #endregion
}
}