Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs @ 10204

Last change on this file since 10204 was 10194, checked in by pfleck, 11 years ago

Implemented InsertRow, DeleteRow, InsertColumn, DeleteColumn in PreprocessingData.

File size: 7.2 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
29
30namespace HeuristicLab.DataPreprocessing {
31  [Item("PreprocessingData", "Represents data used for preprocessing.")]
32  public class PreprocessingData : NamedItem, IPreprocessingData {
33
34    private IDictionary<string, IList> variableValues;
35
36    private IList<string> variableNames;
37
38    private IDictionary<string, int> variableNameIndices;
39
40    private double trainingToTestRatio;
41 
42    private PreprocessingData(PreprocessingData original, Cloner cloner)
43      : base(original, cloner) {
44      variableValues = new Dictionary<string, IList>(variableValues);
45      variableNameIndices = new Dictionary<string, int>(variableNameIndices);
46    }
47
48    public PreprocessingData(IDataAnalysisProblemData problemData)
49      : base() {
50      Name = "-";
51
52      variableNames = new List<string>(problemData.Dataset.VariableNames);
53      // create dictionary from variable name to index
54      variableNameIndices = new Dictionary<string, int>();
55      var variableNamesList = problemData.Dataset.VariableNames.ToList();
56      for (int i = 0; i < variableNamesList.Count; i++) {
57        variableNameIndices.Add(variableNamesList[i], i);
58      }
59
60      // copy values
61      variableValues = new Dictionary<string, IList>();
62      foreach (var variableName in problemData.Dataset.VariableNames) {
63        if (problemData.Dataset.IsType<double>(variableName)) {
64          variableValues[variableName] = problemData.Dataset.GetDoubleValues(variableName).ToList();
65        } else if (problemData.Dataset.IsType<string>(variableName)) {
66          variableValues[variableName] = CreateColumn<string>(problemData.Dataset, variableNameIndices[variableName], x => x);
67        } else if (problemData.Dataset.IsType<DateTime>(variableName)) {
68          variableValues[variableName] = CreateColumn<DateTime>(problemData.Dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
69        } else {
70          throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
71        }
72      }
73
74      trainingToTestRatio = (double)problemData.TrainingPartition.Size / problemData.TestPartition.Size;
75    }
76
77    private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
78      var list = new List<T>(ds.Rows);
79      for (int row = 0; row < ds.Rows; row++) {
80        list[row] = selector(ds.GetValue(row, column));
81      }
82      return list;
83    }
84
85    #region NamedItem abstract Member Implementations
86
87    public override IDeepCloneable Clone(Cloner cloner) {
88      return new PreprocessingData(this, cloner);
89    }
90
91    #endregion
92
93    #region IPreprocessingData Members
94
95    public T GetCell<T>(string variableName, int row) {
96      return (T)variableValues[variableName][row];
97    }
98
99    public void SetCell<T>(string variableName, int row, T value) {
100      variableValues[variableName][row] = value;
101    }
102
103    public IEnumerable<T> GetValues<T>(string variableName) {
104      // TODO: test if cast is valid
105      return (IEnumerable<T>)variableValues[variableName];
106    }
107
108    public void SetValues<T>(string variableName, IEnumerable<T> values) {
109      variableValues[variableName] = values.ToList();
110    }
111
112    public void InsertRow(int rowIndex) {
113      foreach (IList column in variableValues.Values) {
114        Type type = column.GetType().GetGenericArguments()[0];
115
116        column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
117      }
118    }
119
120    public void DeleteRow(int rowIndex) {
121      foreach (IList column in variableValues.Values) {
122        column.RemoveAt(rowIndex);
123      }
124    }
125
126    public void InsertColumn<T>(string variableName, int columnIndex) {
127      variableValues.Add(variableName, new List<T>(Rows));
128      variableNameIndices.Add(variableName, columnIndex);
129      variableNames.Insert(columnIndex, variableName);
130    }
131
132    public void DeleteColumn(string variableName) {
133      variableValues.Remove(variableName);
134      variableNames.RemoveAt(variableNameIndices[variableName]);
135      variableNameIndices.Remove(variableName);
136    }
137
138    public IEnumerable<string> VariableNames {
139      get { return variableNames; }
140    }
141
142    public bool IsType<T>(string variableName) {
143      return variableValues[variableName] is List<T>;
144    }
145
146    public int Columns {
147      get { return variableNames.Count; }
148    }
149
150    public int Rows {
151      get { return variableValues.Count; }
152    }
153    public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
154      var dic = new Dictionary<string, IEnumerable<int>>();
155      foreach (string variableName in VariableNames) {
156        dic.Add(variableName, GetMissingValueIndices(variableName));
157      }
158      return dic;
159    }
160
161    public bool IsMissingValue(string variableName, int rowIndex) {
162      if (IsType<double>(variableName)) {
163        return double.IsNaN(GetCell<double>(variableName, rowIndex));
164      } else if (IsType<string>(variableName)) {
165        return string.IsNullOrEmpty(GetCell<string>(variableName, rowIndex));
166      } else if (IsType<DateTime>(variableName)) {
167        return GetCell<DateTime>(variableName, rowIndex).Equals(DateTime.MinValue);
168      } else {
169        throw new ArgumentException("cell in column with variableName: " + variableName + " and row index " + rowIndex + " contains a non supported type.");
170      }
171    }
172
173    public IEnumerable<int> GetMissingValueIndices(string variableName) {
174      if (IsType<double>(variableName)) {
175        return GetValues<double>(variableName).Select((s, i) => new { i, s }).Where(t => double.IsNaN(t.s)).Select(t => t.i);
176      } else if (IsType<string>(variableName)) {
177        return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
178      } else if (IsType<DateTime>(variableName)) {
179          return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
180      } else {
181        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
182      }
183    }
184
185    #endregion
186  }
187}
Note: See TracBrowser for help on using the repository browser.