Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs @ 10189

Last change on this file since 10189 was 10189, checked in by rstoll, 11 years ago

Added GetMissingValueIndices to IPreprocessingData since it will be used in many places
Removed columnIndex specific method from PreprocessingData

File size: 6.2 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
29
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Column-oriented, mutable copy of the values of a data analysis problem's dataset.
  /// Each variable is stored as a typed list (<c>List&lt;double&gt;</c>, <c>List&lt;string&gt;</c>
  /// or <c>List&lt;DateTime&gt;</c>) keyed by its variable name.
  /// </summary>
  [Item("PreprocessingData", "Represents data used for preprocessing.")]
  public class PreprocessingData : NamedItem, IPreprocessingData {

    // variable name -> column values; every stored column is a List<T> instance
    private IDictionary<string, IList> variableValues;

    // variable names in the order they were reported by the source dataset
    private IList<string> variableNames;

    // variable name -> position of that name in variableNames
    private IDictionary<string, int> variableNameIndices;

    // training partition size divided by test partition size of the source problem data
    private double trainingToTestRatio;

    /// <summary>
    /// Cloning constructor: copies all state from <paramref name="original"/>.
    /// Every column list is duplicated so that modifying cells of the clone
    /// does not modify the original (and vice versa).
    /// </summary>
    /// <param name="original">The instance to copy from.</param>
    /// <param name="cloner">The cloner that is forwarded to the base class.</param>
    private PreprocessingData(PreprocessingData original, Cloner cloner)
      : base(original, cloner) {
      // NOTE: all state must be read from 'original'; the fields of 'this' are still unassigned here.
      variableValues = new Dictionary<string, IList>();
      foreach (var column in original.variableValues) {
        variableValues[column.Key] = CopyColumn(column.Value);
      }
      variableNames = new List<string>(original.variableNames);
      variableNameIndices = new Dictionary<string, int>(original.variableNameIndices);
      trainingToTestRatio = original.trainingToTestRatio;
      Columns = original.Columns;
      Rows = original.Rows;
    }

    /// <summary>
    /// Creates the preprocessing data by copying all columns of the dataset of
    /// <paramref name="problemData"/>.
    /// </summary>
    /// <param name="problemData">The problem data whose dataset is copied.</param>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="problemData"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// Thrown if a column is neither of type double, string nor DateTime.
    /// </exception>
    public PreprocessingData(IDataAnalysisProblemData problemData)
      : base() {
      if (problemData == null) throw new ArgumentNullException("problemData");

      Name = "-";

      var dataset = problemData.Dataset;

      // materialize the names once and reuse the list for index creation and value copying
      variableNames = new List<string>(dataset.VariableNames);

      // create dictionary from variable name to index
      variableNameIndices = new Dictionary<string, int>();
      for (int i = 0; i < variableNames.Count; i++) {
        variableNameIndices.Add(variableNames[i], i);
      }

      // copy values
      variableValues = new Dictionary<string, IList>();
      foreach (var variableName in variableNames) {
        if (dataset.IsType<double>(variableName)) {
          variableValues[variableName] = dataset.GetDoubleValues(variableName).ToList();
        } else if (dataset.IsType<string>(variableName)) {
          variableValues[variableName] = CreateColumn<string>(dataset, variableNameIndices[variableName], x => x);
        } else if (dataset.IsType<DateTime>(variableName)) {
          variableValues[variableName] = CreateColumn<DateTime>(dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
        } else {
          throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
        }
      }

      // The cast makes this a floating-point division: a test partition of size 0
      // results in Infinity (or NaN) instead of a DivideByZeroException.
      trainingToTestRatio = (double)problemData.TrainingPartition.Size / problemData.TestPartition.Size;
      Columns = dataset.Columns;
      Rows = dataset.Rows;
    }

    /// <summary>
    /// Builds a typed column by converting the string representation of every cell
    /// of the given dataset column with <paramref name="selector"/>.
    /// </summary>
    /// <param name="ds">The dataset to read from.</param>
    /// <param name="column">The index of the column to read.</param>
    /// <param name="selector">Converts the value returned by the dataset to <typeparamref name="T"/>.</param>
    /// <returns>A <c>List&lt;T&gt;</c> containing one converted value per row.</returns>
    private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
      // The constructor argument only reserves capacity, the list is still empty.
      // Therefore values have to be appended; assigning by index would throw.
      var list = new List<T>(ds.Rows);
      for (int row = 0; row < ds.Rows; row++) {
        list.Add(selector(ds.GetValue(row, column)));
      }
      return list;
    }

    /// <summary>
    /// Creates a new list of the same runtime type as <paramref name="column"/> that contains the same elements.
    /// </summary>
    /// <param name="column">The column to duplicate; expected to be a <c>List&lt;T&gt;</c>.</param>
    /// <returns>An independent list with the same elements.</returns>
    private static IList CopyColumn(IList column) {
      // All columns are stored as List<T>, which provides a constructor taking the elements to copy.
      // Going through the runtime type keeps the element type without enumerating all supported types.
      return (IList)Activator.CreateInstance(column.GetType(), column);
    }

    #region NamedItem abstract Member Implementations

    /// <summary>
    /// Creates a copy of this instance by means of the cloning constructor.
    /// </summary>
    /// <param name="cloner">The cloner that is passed to the cloning constructor.</param>
    /// <returns>The newly created copy.</returns>
    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreprocessingData(this, cloner);
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>
    /// Returns the value of the cell in the given row of the variable, cast to <typeparamref name="T"/>.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    /// <param name="row">The zero-based row index.</param>
    public T GetCell<T>(string variableName, int row) {
      return (T)variableValues[variableName][row];
    }

    /// <summary>
    /// Overwrites the value of the cell in the given row of the variable.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    /// <param name="row">The zero-based row index.</param>
    /// <param name="value">The new value of the cell.</param>
    public void SetCell<T>(string variableName, int row, T value) {
      variableValues[variableName][row] = value;
    }

    /// <summary>
    /// Returns the values of the variable. The internally stored list itself is returned (no copy),
    /// hence <typeparamref name="T"/> has to match the element type of the stored column.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    public IEnumerable<T> GetValues<T>(string variableName) {
      return (IEnumerable<T>)variableValues[variableName];
    }

    /// <summary>
    /// Replaces the column of the variable by a new list containing <paramref name="values"/>.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    /// <param name="values">The new values of the column.</param>
    public void SetValues<T>(string variableName, IEnumerable<T> values) {
      variableValues[variableName] = values.ToList();
    }

    /// <summary>Not implemented yet.</summary>
    public void InsertRow(int rowIndex) {
      throw new NotImplementedException();
    }

    /// <summary>Not implemented yet.</summary>
    public void DeleteRow(int rowIndex) {
      throw new NotImplementedException();
    }

    /// <summary>Not implemented yet.</summary>
    public void InsertColumn(string variableName, int columnIndex) {
      throw new NotImplementedException();
    }

    /// <summary>Not implemented yet.</summary>
    public void DeleteColumn(string variableName) {
      throw new NotImplementedException();
    }

    /// <summary>
    /// The names of all variables as copied from the source dataset.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    /// <summary>
    /// Checks whether the column of the variable is stored as <c>List&lt;T&gt;</c>.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    public bool IsType<T>(string variableName) {
      return variableValues[variableName] is List<T>;
    }

    /// <summary>
    /// The number of columns as reported by the source dataset.
    /// </summary>
    public int Columns {
      get;
      private set;
    }

    /// <summary>
    /// The number of rows as reported by the source dataset.
    /// </summary>
    public int Rows {
      get;
      private set;
    }

    /// <summary>Not implemented yet.</summary>
    public void ExportTo(IDataAnalysisProblemData problemData) {
      throw new NotImplementedException();
    }

    /// <summary>
    /// Determines the row indices of the missing values for every variable.
    /// </summary>
    /// <returns>A dictionary from variable name to the row indices of its missing values.</returns>
    public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
      var dic = new Dictionary<string, IEnumerable<int>>();
      foreach (string variableName in VariableNames) {
        dic.Add(variableName, GetMissingValueIndices(variableName));
      }
      return dic;
    }

    /// <summary>
    /// Determines the row indices of the missing values of one variable. A value counts as missing if it is
    /// NaN (double), null or empty (string) or equal to <see cref="DateTime.MinValue"/> (DateTime).
    /// The result is evaluated lazily while it is enumerated.
    /// </summary>
    /// <param name="variableName">The name of the variable (column).</param>
    /// <exception cref="ArgumentException">Thrown if the column has an unsupported element type.</exception>
    public IEnumerable<int> GetMissingValueIndices(string variableName) {
      if (IsType<double>(variableName)) {
        return GetIndicesWhere<double>(variableName, x => double.IsNaN(x));
      } else if (IsType<string>(variableName)) {
        return GetIndicesWhere<string>(variableName, x => string.IsNullOrEmpty(x));
      } else if (IsType<DateTime>(variableName)) {
        return GetIndicesWhere<DateTime>(variableName, x => x.Equals(DateTime.MinValue));
      } else {
        throw new ArgumentException("column with name: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Returns the row indices of all values of the variable for which <paramref name="predicate"/> holds.
    /// </summary>
    private IEnumerable<int> GetIndicesWhere<T>(string variableName, Func<T, bool> predicate) {
      return GetValues<T>(variableName)
        .Select((value, index) => new { index, value })
        .Where(t => predicate(t.value))
        .Select(t => t.index);
    }

    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.