Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs @ 10235

Last change on this file since 10235 was 10235, checked in by pfleck, 10 years ago

Fixed bug with partition range.

File size: 7.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Data;
29using HeuristicLab.Problems.DataAnalysis;
30
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Mutable, column-oriented copy of a problem's dataset that preprocessing operations work on.
  /// Each column is held as a generic list keyed by its variable name; the constructor creates
  /// double, string and DateTime columns.
  /// </summary>
  [Item("PreprocessingData", "Represents data used for preprocessing.")]
  public class PreprocessingData : NamedItem, IPreprocessingData {

    // column values by variable name; every value is a List<T>
    private IDictionary<string, IList> variableValues;

    // variable names in column order
    private IList<string> variableNames;

    // variable name -> position of that name in variableNames
    private IDictionary<string, int> variableNameIndices;

    // training partition size divided by the number of rows of the source dataset
    private double trainingToTestRatio;

    /// <summary>
    /// Cloning constructor. Copies all state from <paramref name="original"/>; the column lists
    /// are duplicated so that edits to the clone do not show up in the original.
    /// </summary>
    private PreprocessingData(PreprocessingData original, Cloner cloner)
      : base(original, cloner) {
      // NOTE: everything must be read from 'original' - the fields of this instance are still null here.
      variableNames = new List<string>(original.variableNames);
      variableNameIndices = new Dictionary<string, int>(original.variableNameIndices);

      variableValues = new Dictionary<string, IList>();
      foreach (var pair in original.variableValues) {
        variableValues.Add(pair.Key, CopyColumn(pair.Value));
      }

      trainingToTestRatio = original.trainingToTestRatio;
    }

    /// <summary>
    /// Creates preprocessing data by copying every variable of the dataset of <paramref name="problemData"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="problemData"/> is null.</exception>
    /// <exception cref="ArgumentException">Thrown if a variable is neither double, string nor DateTime.</exception>
    public PreprocessingData(IDataAnalysisProblemData problemData)
      : base() {
      if (problemData == null) throw new ArgumentNullException("problemData");

      Name = "-";

      var dataset = problemData.Dataset;

      variableNames = new List<string>(dataset.VariableNames);
      // create dictionary from variable name to index
      variableNameIndices = new Dictionary<string, int>();
      RebuildVariableNameIndices();

      // copy values
      variableValues = new Dictionary<string, IList>();
      foreach (var variableName in variableNames) {
        if (dataset.IsType<double>(variableName)) {
          variableValues[variableName] = dataset.GetDoubleValues(variableName).ToList();
        } else if (dataset.IsType<string>(variableName)) {
          variableValues[variableName] = CreateColumn<string>(dataset, variableNameIndices[variableName], x => x);
        } else if (dataset.IsType<DateTime>(variableName)) {
          variableValues[variableName] = CreateColumn<DateTime>(dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
        } else {
          throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
        }
      }

      // Math.Max with double.Epsilon avoids a division by zero for a dataset without rows
      trainingToTestRatio = (double)problemData.TrainingPartition.Size / Math.Max(dataset.Rows, double.Epsilon);
    }

    /// <summary>
    /// Reads one dataset column cell by cell and converts each string value with <paramref name="selector"/>.
    /// </summary>
    private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
      // The constructor argument only reserves capacity, the list is still empty;
      // the values therefore have to be appended instead of assigned by index.
      var list = new List<T>(ds.Rows);
      for (int row = 0; row < ds.Rows; row++) {
        list.Add(selector(ds.GetValue(row, column)));
      }
      return list;
    }

    /// <summary>
    /// Creates a new list of the same runtime type as <paramref name="column"/> containing the same elements.
    /// </summary>
    private static IList CopyColumn(IList column) {
      // columns are List<T> instances; List<T> offers a constructor that copies from an IEnumerable<T>
      return (IList)Activator.CreateInstance(column.GetType(), column);
    }

    /// <summary>
    /// Recomputes <c>variableNameIndices</c> from the current order of <c>variableNames</c>.
    /// Has to be called whenever a name is inserted or removed, as this shifts the following columns.
    /// </summary>
    private void RebuildVariableNameIndices() {
      variableNameIndices.Clear();
      for (int i = 0; i < variableNames.Count; i++) {
        variableNameIndices.Add(variableNames[i], i);
      }
    }

    #region NamedItem abstract Member Implementations

    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreprocessingData(this, cloner);
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>Returns the value at <paramref name="row"/> of the given column, cast to T.</summary>
    public T GetCell<T>(string variableName, int row) {
      return (T)variableValues[variableName][row];
    }

    /// <summary>Overwrites the value at <paramref name="row"/> of the given column.</summary>
    public void SetCell<T>(string variableName, int row, T value) {
      variableValues[variableName][row] = value;
    }

    /// <summary>
    /// Returns the stored column itself (not a copy) as a sequence of T.
    /// </summary>
    public IEnumerable<T> GetValues<T>(string variableName) {
      // TODO: test if cast is valid
      return (IEnumerable<T>)variableValues[variableName];
    }

    /// <summary>
    /// Replaces the column stored for <paramref name="variableName"/> with a new list built from
    /// <paramref name="values"/>. The number of values is not checked against <see cref="Rows"/>.
    /// </summary>
    public void SetValues<T>(string variableName, IEnumerable<T> values) {
      variableValues[variableName] = values.ToList();
    }

    /// <summary>
    /// Inserts a new row at <paramref name="rowIndex"/> into every column. Value type columns
    /// receive the default value of their element type, reference type columns receive null.
    /// </summary>
    public void InsertRow(int rowIndex) {
      foreach (IList column in variableValues.Values) {
        Type type = column.GetType().GetGenericArguments()[0];

        column.Insert(rowIndex, type.IsValueType ? Activator.CreateInstance(type) : null);
      }
    }

    /// <summary>Removes the row at <paramref name="rowIndex"/> from every column.</summary>
    public void DeleteRow(int rowIndex) {
      foreach (IList column in variableValues.Values) {
        column.RemoveAt(rowIndex);
      }
    }

    /// <summary>
    /// Inserts a new column of element type T at position <paramref name="columnIndex"/>.
    /// The column gets one default(T) entry per existing row.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">Thrown if <paramref name="columnIndex"/> is not a valid insert position.</exception>
    /// <exception cref="ArgumentException">Thrown if a column named <paramref name="variableName"/> already exists.</exception>
    public void InsertColumn<T>(string variableName, int columnIndex) {
      // validate before touching any collection so that a failure leaves the data unchanged
      if (columnIndex < 0 || columnIndex > variableNames.Count) {
        throw new ArgumentOutOfRangeException("columnIndex");
      }

      // new List<T>(Rows) would only reserve capacity and yield a column without any rows
      var column = Enumerable.Repeat(default(T), Rows).ToList();

      variableValues.Add(variableName, column);
      variableNames.Insert(columnIndex, variableName);
      // all columns at or behind columnIndex moved by one position
      RebuildVariableNameIndices();
    }

    /// <summary>Removes the column named <paramref name="variableName"/>.</summary>
    /// <exception cref="KeyNotFoundException">Thrown if no such column exists.</exception>
    public void DeleteColumn(string variableName) {
      int columnIndex = variableNameIndices[variableName];

      variableValues.Remove(variableName);
      variableNames.RemoveAt(columnIndex);
      // all columns behind the removed one moved by one position
      RebuildVariableNameIndices();
    }

    /// <summary>Range from 0 to the row count scaled by the training ratio of the source problem data.</summary>
    public IntRange TrainingPartition {
      get { return new IntRange(0, (int)(Rows * trainingToTestRatio)); }
    }

    /// <summary>Range from the end of <see cref="TrainingPartition"/> to <see cref="Rows"/>.</summary>
    public IntRange TestPartition {
      get { return new IntRange((int)(Rows * trainingToTestRatio), Rows); }
    }

    /// <summary>Variable names in column order.</summary>
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    /// <summary>Returns true if the column is stored as a List of T.</summary>
    public bool IsType<T>(string variableName) {
      return variableValues[variableName] is List<T>;
    }

    /// <summary>Number of columns.</summary>
    public int Columns {
      get { return variableNames.Count; }
    }

    /// <summary>Number of rows, taken from the first column; 0 if there are no columns.</summary>
    public int Rows {
      get { return variableNames.Count == 0 ? 0 : variableValues[variableNames[0]].Count; }
    }

    /// <summary>Returns the missing value indices of every column, keyed by variable name.</summary>
    public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
      var dic = new Dictionary<string, IEnumerable<int>>();
      foreach (string variableName in VariableNames) {
        dic.Add(variableName, GetMissingValueIndices(variableName));
      }
      return dic;
    }

    /// <summary>
    /// Returns true if the cell is NaN (double), null or empty (string) or DateTime.MinValue (DateTime).
    /// </summary>
    /// <exception cref="ArgumentException">Thrown if the column has any other element type.</exception>
    public bool IsMissingValue(string variableName, int rowIndex) {
      if (IsType<double>(variableName)) {
        return double.IsNaN(GetCell<double>(variableName, rowIndex));
      } else if (IsType<string>(variableName)) {
        return string.IsNullOrEmpty(GetCell<string>(variableName, rowIndex));
      } else if (IsType<DateTime>(variableName)) {
        return GetCell<DateTime>(variableName, rowIndex).Equals(DateTime.MinValue);
      } else {
        throw new ArgumentException("cell in column with variableName: " + variableName + " and row index " + rowIndex + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Returns the row indices of all missing values of the column, using the same criteria as
    /// <see cref="IsMissingValue"/>. The result is a deferred query over the stored column.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown if the column has an unsupported element type.</exception>
    public IEnumerable<int> GetMissingValueIndices(string variableName) {
      if (IsType<double>(variableName)) {
        return GetValues<double>(variableName).Select((s, i) => new { i, s }).Where(t => double.IsNaN(t.s)).Select(t => t.i);
      } else if (IsType<string>(variableName)) {
        return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
      } else if (IsType<DateTime>(variableName)) {
        return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
      } else {
        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
      }
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>Creates a Dataset from the variable names and the stored columns, in column order.</summary>
    public Dataset ExportToDataset() {
      IList<IList> values = new List<IList>();
      foreach (var variable in VariableNames) {
        values.Add(variableValues[variable]);
      }

      var dataset = new Dataset(variableNames, values);
      return dataset;
    }

    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.