Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs @ 10221

Last change on this file since 10221 was 10221, checked in by pfleck, 11 years ago
  • Cloned Algorithm and swapped Dataset and other members of the DataAnalysisProblemData
  • Refactored GetMostOuterContent
File size: 8.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Data;
29using HeuristicLab.Problems.DataAnalysis;
30
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Mutable, column-oriented copy of the dataset of a data analysis problem.
  /// Each column is kept as a generic list (of double, string or DateTime values)
  /// and is addressed by its variable name.
  /// </summary>
  [Item("PreprocessingData", "Represents data used for preprocessing.")]
  public class PreprocessingData : NamedItem, IPreprocessingData {

    // column values, keyed by variable name
    private IDictionary<string, IList> variableValues;

    // variable names in column order
    private IList<string> variableNames;
    private IntRange trainingPartition;
    private IntRange testPartition;

    // variable name -> position of that name in variableNames
    private IDictionary<string, int> variableNameIndices;

    // share of all partitioned rows that belongs to the training partition, in [0, 1]
    private double trainingFraction;

    /// <summary>
    /// Copy constructor used by <see cref="Clone(Cloner)"/>.
    /// All state is taken from <paramref name="original"/>; the column lists are
    /// duplicated so that edits on the clone do not leak into the original.
    /// </summary>
    /// <param name="original">The instance to copy.</param>
    /// <param name="cloner">The cloner that drives the current clone operation.</param>
    private PreprocessingData(PreprocessingData original, Cloner cloner)
      : base(original, cloner) {
      // The fields of the new instance are still null at this point, so everything
      // has to be read from 'original' (reading our own fields would always throw).
      variableValues = new Dictionary<string, IList>();
      foreach (var column in original.variableValues) {
        variableValues.Add(column.Key, CopyColumn(column.Value));
      }

      variableNames = new List<string>(original.variableNames);
      variableNameIndices = new Dictionary<string, int>(original.variableNameIndices);

      trainingPartition = cloner.Clone(original.trainingPartition);
      testPartition = cloner.Clone(original.testPartition);
      trainingFraction = original.trainingFraction;
    }

    /// <summary>
    /// Creates the preprocessing data from the dataset of <paramref name="problemData"/>.
    /// </summary>
    /// <param name="problemData">Source of the variable names, values and partitions.</param>
    /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is null.</exception>
    /// <exception cref="ArgumentException">A column is neither double, string nor DateTime.</exception>
    public PreprocessingData(IDataAnalysisProblemData problemData)
      : base() {
      if (problemData == null) {
        throw new ArgumentNullException("problemData");
      }

      Name = "-";

      var dataset = problemData.Dataset;
      variableNames = new List<string>(dataset.VariableNames);

      // create dictionary from variable name to index
      variableNameIndices = new Dictionary<string, int>();
      for (int i = 0; i < variableNames.Count; i++) {
        variableNameIndices.Add(variableNames[i], i);
      }

      // copy values
      variableValues = new Dictionary<string, IList>();
      foreach (var variableName in variableNames) {
        if (dataset.IsType<double>(variableName)) {
          variableValues[variableName] = dataset.GetDoubleValues(variableName).ToList();
        } else if (dataset.IsType<string>(variableName)) {
          variableValues[variableName] = CreateColumn<string>(dataset, variableNameIndices[variableName], x => x);
        } else if (dataset.IsType<DateTime>(variableName)) {
          variableValues[variableName] = CreateColumn<DateTime>(dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
        } else {
          throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
        }
      }

      trainingPartition = problemData.TrainingPartition;
      testPartition = problemData.TestPartition;

      // Store the training share of all partitioned rows. The plain quotient
      // training / test is not usable as a factor for Rows: it exceeds 1 whenever
      // the training partition is the larger one and is Infinity/NaN for an empty
      // test partition.
      int trainingSize = problemData.TrainingPartition.Size;
      int partitionedRows = trainingSize + problemData.TestPartition.Size;
      trainingFraction = partitionedRows > 0 ? (double)trainingSize / partitionedRows : 0.0;
    }

    /// <summary>
    /// Reads one dataset column cell by cell and converts each textual value with
    /// <paramref name="selector"/>.
    /// </summary>
    /// <returns>A list of T with one entry per dataset row.</returns>
    private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
      // The constructor argument is only a capacity hint; the list starts empty,
      // so the values have to be appended rather than assigned by index.
      var list = new List<T>(ds.Rows);
      for (int row = 0; row < ds.Rows; row++) {
        list.Add(selector(ds.GetValue(row, column)));
      }
      return list;
    }

    /// <summary>
    /// Creates a new list of the same runtime type and with the same elements as
    /// <paramref name="column"/>.
    /// </summary>
    private static IList CopyColumn(IList column) {
      // every column is a List<T>, which offers a constructor taking IEnumerable<T>
      return (IList)Activator.CreateInstance(column.GetType(), column);
    }

    /// <summary>
    /// Rewrites the name-to-index entries of all columns from
    /// <paramref name="startIndex"/> to the last column.
    /// </summary>
    private void ReindexColumns(int startIndex) {
      for (int i = startIndex; i < variableNames.Count; i++) {
        variableNameIndices[variableNames[i]] = i;
      }
    }

    #region NamedItem abstract Member Implementations

    /// <summary>Creates a copy of this instance via the copy constructor.</summary>
    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreprocessingData(this, cloner);
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>Returns the value of one cell, cast to T.</summary>
    public T GetCell<T>(string variableName, int row) {
      return (T)variableValues[variableName][row];
    }

    /// <summary>Overwrites the value of one cell.</summary>
    public void SetCell<T>(string variableName, int row, T value) {
      variableValues[variableName][row] = value;
    }

    /// <summary>
    /// Returns the stored list of the column itself (not a copy).
    /// </summary>
    /// <exception cref="InvalidCastException">The column does not hold values of type T.</exception>
    public IEnumerable<T> GetValues<T>(string variableName) {
      return (IEnumerable<T>)variableValues[variableName];
    }

    /// <summary>Replaces the values of a column with a snapshot of <paramref name="values"/>.</summary>
    public void SetValues<T>(string variableName, IEnumerable<T> values) {
      variableValues[variableName] = values.ToList();
    }

    /// <summary>
    /// Inserts a new row at <paramref name="rowIndex"/> into every column. Columns of a
    /// value type receive that type's default instance, all other columns receive null.
    /// </summary>
    public void InsertRow(int rowIndex) {
      foreach (IList column in variableValues.Values) {
        Type type = column.GetType().GetGenericArguments()[0];

        column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
      }
    }

    /// <summary>Removes the row at <paramref name="rowIndex"/> from every column.</summary>
    public void DeleteRow(int rowIndex) {
      foreach (IList column in variableValues.Values) {
        column.RemoveAt(rowIndex);
      }
    }

    /// <summary>
    /// Inserts a new column of element type T at <paramref name="columnIndex"/>.
    /// The column gets one default(T) entry per existing row.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="columnIndex"/> is not a valid insert position.</exception>
    /// <exception cref="ArgumentException">A column named <paramref name="variableName"/> already exists.</exception>
    public void InsertColumn<T>(string variableName, int columnIndex) {
      // validate before touching any collection so a bad index cannot leave the
      // three collections out of sync
      if (columnIndex < 0 || columnIndex > variableNames.Count) {
        throw new ArgumentOutOfRangeException("columnIndex");
      }

      // determine the row count before the new name can become the first column
      int rowCount = Rows;

      // the column must have as many rows as the others, a capacity hint is not enough
      variableValues.Add(variableName, Enumerable.Repeat(default(T), rowCount).ToList());
      variableNames.Insert(columnIndex, variableName);

      // registers the new column and shifts the indices of all columns behind it
      ReindexColumns(columnIndex);
    }

    /// <summary>Removes the column named <paramref name="variableName"/>.</summary>
    /// <exception cref="KeyNotFoundException">No column with that name exists.</exception>
    public void DeleteColumn(string variableName) {
      int columnIndex = variableNameIndices[variableName];

      variableValues.Remove(variableName);
      variableNames.RemoveAt(columnIndex);
      variableNameIndices.Remove(variableName);

      // the columns behind the removed one moved up by one position
      ReindexColumns(columnIndex);
    }

    /// <summary>Row range from 0 to the training share of the current row count.</summary>
    public IntRange TrainingPartition {
      get { return new IntRange(0, (int)(Rows * trainingFraction)); }
    }

    /// <summary>Row range that starts where the training partition ends.</summary>
    public IntRange TestPartition {
      // NOTE(review): the training range ends at the same value at which this range
      // starts, which suggests an exclusive end; in that case the end here should
      // probably be Rows instead of Rows - 1. Confirm against IntRange's semantics.
      get { return new IntRange((int)(Rows * trainingFraction), Math.Max(Rows - 1, 0)); }
    }

    /// <summary>Variable names in column order.</summary>
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    /// <summary>Checks whether the column is stored as a list of T.</summary>
    public bool IsType<T>(string variableName) {
      return variableValues[variableName] is List<T>;
    }

    /// <summary>Number of columns.</summary>
    public int Columns {
      get { return variableNames.Count; }
    }

    /// <summary>Number of rows, taken from the first column; 0 if there is no column.</summary>
    public int Rows {
      get { return variableNames.Count > 0 ? variableValues[variableNames[0]].Count : 0; }
    }

    /// <summary>Returns the missing-value row indices of every column, keyed by variable name.</summary>
    public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
      var dic = new Dictionary<string, IEnumerable<int>>();
      foreach (string variableName in VariableNames) {
        dic.Add(variableName, GetMissingValueIndices(variableName));
      }
      return dic;
    }

    /// <summary>
    /// Checks whether a cell holds a missing value: NaN for double columns, null or
    /// empty for string columns, DateTime.MinValue for DateTime columns.
    /// </summary>
    /// <exception cref="ArgumentException">The column has an unsupported element type.</exception>
    public bool IsMissingValue(string variableName, int rowIndex) {
      if (IsType<double>(variableName)) {
        return double.IsNaN(GetCell<double>(variableName, rowIndex));
      } else if (IsType<string>(variableName)) {
        return string.IsNullOrEmpty(GetCell<string>(variableName, rowIndex));
      } else if (IsType<DateTime>(variableName)) {
        return GetCell<DateTime>(variableName, rowIndex).Equals(DateTime.MinValue);
      } else {
        throw new ArgumentException("cell in column with variableName: " + variableName + " and row index " + rowIndex + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Returns the row indices of all missing values of one column. The column type is
    /// checked immediately, the rows are scanned lazily while the result is enumerated.
    /// </summary>
    /// <exception cref="ArgumentException">The column has an unsupported element type.</exception>
    public IEnumerable<int> GetMissingValueIndices(string variableName) {
      if (IsType<double>(variableName)) {
        return GetValues<double>(variableName).Select((s, i) => new { i, s }).Where(t => double.IsNaN(t.s)).Select(t => t.i);
      } else if (IsType<string>(variableName)) {
        return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
      } else if (IsType<DateTime>(variableName)) {
        return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
      } else {
        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
      }
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>
    /// Builds a Dataset from the current variable names and column lists, in column order.
    /// </summary>
    public Dataset ExportToDataset() {
      IList<IList> values = new List<IList>();
      foreach (var variable in VariableNames) {
        values.Add(variableValues[variable]);
      }

      var dataset = new Dataset(variableNames, values);
      return dataset;
    }

    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.