Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataPreprocessing/HeuristicLab.DataPreprocessing/3.3/Implementations/PreprocessingData.cs @ 10192

Last change on this file since 10192 was 10192, checked in by mleitner, 10 years ago

Implement per-attribute manipulations: replace values by the median, the average, a linear interpolation of the previous and next value, or a random value.

File size: 9.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
29
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Mutable, column-oriented copy of the dataset of a data analysis problem.
  /// The constructor creates one <c>List&lt;double&gt;</c>, <c>List&lt;string&gt;</c> or
  /// <c>List&lt;DateTime&gt;</c> per variable; the Replace* methods overwrite selected rows
  /// of a single column.
  /// </summary>
  [Item("PreprocessingData", "Represents data used for preprocessing.")]
  public class PreprocessingData : NamedItem, IPreprocessingData {

    // variable name -> column values (always some List<T>)
    private IDictionary<string, IList> variableValues;

    // variable names in dataset order
    private IList<string> variableNames;

    // variable name -> column index in the source dataset
    private IDictionary<string, int> variableNameIndices;

    private double trainingToTestRatio;
    private StatisticInfo statisticInfo;

    // One generator per instance: creating a time-seeded Random on every call can
    // yield identical sequences when calls follow each other quickly.
    private readonly Random random = new Random();

    /// <summary>
    /// Cloning constructor. Copies all state from <paramref name="original"/>;
    /// every column list is duplicated so that the clone can be modified
    /// without affecting the original.
    /// </summary>
    private PreprocessingData(PreprocessingData original, Cloner cloner)
      : base(original, cloner) {
      variableValues = new Dictionary<string, IList>();
      foreach (var column in original.variableValues) {
        variableValues.Add(column.Key, CopyColumn(column.Value));
      }
      variableNames = new List<string>(original.variableNames);
      variableNameIndices = new Dictionary<string, int>(original.variableNameIndices);

      trainingToTestRatio = original.trainingToTestRatio;
      Columns = original.Columns;
      Rows = original.Rows;

      // the statistics object is bound to the instance it is created with
      statisticInfo = new StatisticInfo(this);
    }

    /// <summary>
    /// Creates the preprocessing data from the dataset of <paramref name="problemData"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is null.</exception>
    /// <exception cref="ArgumentException">A variable is neither of type double, string nor DateTime.</exception>
    public PreprocessingData(IDataAnalysisProblemData problemData)
      : base() {
      if (problemData == null) {
        throw new ArgumentNullException("problemData");
      }

      Name = "-";

      var dataset = problemData.Dataset;

      variableNames = new List<string>(dataset.VariableNames);
      // create dictionary from variable name to index
      variableNameIndices = new Dictionary<string, int>();
      for (int i = 0; i < variableNames.Count; i++) {
        variableNameIndices.Add(variableNames[i], i);
      }

      // copy values
      variableValues = new Dictionary<string, IList>();
      foreach (var variableName in variableNames) {
        if (dataset.IsType<double>(variableName)) {
          variableValues[variableName] = dataset.GetDoubleValues(variableName).ToList();
        } else if (dataset.IsType<string>(variableName)) {
          variableValues[variableName] = CreateColumn<string>(dataset, variableNameIndices[variableName], x => x);
        } else if (dataset.IsType<DateTime>(variableName)) {
          variableValues[variableName] = CreateColumn<DateTime>(dataset, variableNameIndices[variableName], x => DateTime.Parse(x));
        } else {
          throw new ArgumentException("The datatype of column " + variableName + " must be of type List<double>, List<string> or List<DateTime>");
        }
      }

      // floating-point division: an empty test partition yields Infinity or NaN, not an exception
      trainingToTestRatio = (double)problemData.TrainingPartition.Size / problemData.TestPartition.Size;
      Columns = dataset.Columns;
      Rows = dataset.Rows;

      statisticInfo = new StatisticInfo(this);
    }

    /// <summary>
    /// Reads column <paramref name="column"/> of <paramref name="ds"/> row by row and
    /// converts each value with <paramref name="selector"/>.
    /// </summary>
    private static IList CreateColumn<T>(Dataset ds, int column, Func<string, T> selector) {
      // The constructor argument only reserves capacity; the list is still empty,
      // so the values have to be appended instead of assigned by index.
      var list = new List<T>(ds.Rows);
      for (int row = 0; row < ds.Rows; row++) {
        list.Add(selector(ds.GetValue(row, column)));
      }
      return list;
    }

    /// <summary>
    /// Returns a new list of the same runtime type as <paramref name="column"/> containing the same elements.
    /// </summary>
    private static IList CopyColumn(IList column) {
      // All columns are List<T> instances (see the constructor and SetValues), and
      // List<T> offers a copy constructor taking IEnumerable<T>.
      return (IList)Activator.CreateInstance(column.GetType(), column);
    }

    #region NamedItem abstract Member Implementations

    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreprocessingData(this, cloner);
    }

    #endregion

    #region IPreprocessingData Members

    /// <summary>Returns the value of variable <paramref name="variableName"/> in row <paramref name="row"/>, cast to <typeparamref name="T"/>.</summary>
    public T GetCell<T>(string variableName, int row) {
      return (T)variableValues[variableName][row];
    }

    /// <summary>Overwrites the value of variable <paramref name="variableName"/> in row <paramref name="row"/>.</summary>
    public void SetCell<T>(string variableName, int row, T value) {
      variableValues[variableName][row] = value;
    }

    /// <summary>Returns the stored column of <paramref name="variableName"/> itself (no copy), cast to <c>IEnumerable&lt;T&gt;</c>.</summary>
    public IEnumerable<T> GetValues<T>(string variableName) {
      return (IEnumerable<T>)variableValues[variableName];
    }

    /// <summary>Replaces the whole column of <paramref name="variableName"/> with a list copy of <paramref name="values"/>.</summary>
    public void SetValues<T>(string variableName, IEnumerable<T> values) {
      variableValues[variableName] = values.ToList();
    }

    public void InsertRow(int rowIndex) {
      throw new NotImplementedException();
    }

    public void DeleteRow(int rowIndex) {
      throw new NotImplementedException();
    }

    public void InsertColumn(string variableName, int columnIndex) {
      throw new NotImplementedException();
    }

    public void DeleteColumn(string variableName) {
      throw new NotImplementedException();
    }

    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    /// <summary>Returns true if the column of <paramref name="variableName"/> is a <c>List&lt;T&gt;</c>.</summary>
    public bool IsType<T>(string variableName) {
      return variableValues[variableName] is List<T>;
    }

    public int Columns {
      get;
      private set;
    }

    public int Rows {
      get;
      private set;
    }

    public void ExportTo(IDataAnalysisProblemData problemData) {
      throw new NotImplementedException();
    }

    /// <summary>Returns, for every variable, the row indices that hold a missing value.</summary>
    public IDictionary<string, IEnumerable<int>> GetMissingValueIndices() {
      var dic = new Dictionary<string, IEnumerable<int>>();
      foreach (string variableName in VariableNames) {
        dic.Add(variableName, GetMissingValueIndices(variableName));
      }
      return dic;
    }

    /// <summary>
    /// Returns true if the cell is missing: NaN for double columns, null or empty for
    /// string columns and <c>DateTime.MinValue</c> for DateTime columns.
    /// </summary>
    /// <exception cref="ArgumentException">The column has none of the three supported types.</exception>
    public bool IsMissingValue(string variableName, int rowIndex) {
      if (IsType<double>(variableName)) {
        return double.IsNaN(GetCell<double>(variableName, rowIndex));
      } else if (IsType<string>(variableName)) {
        return string.IsNullOrEmpty(GetCell<string>(variableName, rowIndex));
      } else if (IsType<DateTime>(variableName)) {
        return GetCell<DateTime>(variableName, rowIndex).Equals(DateTime.MinValue);
      } else {
        throw new ArgumentException("cell in column with variableName: " + variableName + " and row index " + rowIndex + " contains a non supported type.");
      }
    }

    /// <summary>
    /// Returns the row indices of <paramref name="variableName"/> that hold a missing value
    /// (same criteria as <see cref="IsMissingValue"/>). The result is a lazily evaluated query
    /// over the live column.
    /// </summary>
    /// <exception cref="ArgumentException">The column has none of the three supported types.</exception>
    public IEnumerable<int> GetMissingValueIndices(string variableName) {
      if (IsType<double>(variableName)) {
        return GetValues<double>(variableName).Select((s, i) => new { i, s }).Where(t => double.IsNaN(t.s)).Select(t => t.i);
      } else if (IsType<string>(variableName)) {
        return GetValues<string>(variableName).Select((s, i) => new { i, s }).Where(t => string.IsNullOrEmpty(t.s)).Select(t => t.i);
      } else if (IsType<DateTime>(variableName)) {
        return GetValues<DateTime>(variableName).Select((s, i) => new { i, s }).Where(t => t.s.Equals(DateTime.MinValue)).Select(t => t.i);
      } else {
        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
      }
    }

    /// <summary>Writes <paramref name="value"/> into every row listed in <paramref name="indices"/>.</summary>
    public void ReplaceIndicesByValue<T>(string variableName, IEnumerable<int> indices, T value) {
      foreach (int index in indices) {
        SetCell<T>(variableName, index, value);
      }
    }

    /// <summary>Replaces the given rows with the average reported by the statistics object for the column.</summary>
    public void ReplaceIndicesByAverageValue(string variableName, IEnumerable<int> indices) {
      double average = statisticInfo.GetAverage(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, average);
    }

    /// <summary>Replaces the given rows with the median reported by the statistics object for the column.</summary>
    // NOTE(review): this method was a second, identically-signed ReplaceIndicesByAverageValue
    // (which does not compile); renamed to match what it computes - confirm the name against IPreprocessingData.
    public void ReplaceIndicesByMedianValue(string variableName, IEnumerable<int> indices) {
      double median = statisticInfo.GetMedian(variableName);
      ReplaceIndicesByValue<double>(variableName, indices, median);
    }

    /// <summary>Replaces each of the given rows with a random value drawn uniformly from [min, max) of the column.</summary>
    public void ReplaceIndicesByRandomValue(string variableName, IEnumerable<int> indices) {
      double max = statisticInfo.GetMax<double>(variableName);
      double min = statisticInfo.GetMin<double>(variableName);
      double randMultiplier = (max - min);
      foreach (int index in indices) {
        double rand = random.NextDouble() * randMultiplier + min;
        SetCell<double>(variableName, index, rand);
      }
    }

    /// <summary>
    /// Replaces each of the given rows with the mean of its direct predecessor and successor.
    /// The first and the last row have only one neighbour and are left unchanged.
    /// </summary>
    public void ReplaceIndicesByLinearInterpolationOfNeighbours(string variableName, IEnumerable<int> indices) {
      int countValues = GetValues<double>(variableName).Count();
      foreach (int index in indices) {
        // dont replace first or last values; the last valid index is countValues - 1
        // and has no successor, so it must be excluded as well
        if (index > 0 && index < countValues - 1) {
          double prev = GetCell<double>(variableName, index - 1);
          double next = GetCell<double>(variableName, index + 1);

          double interpolated = (prev + next) / 2;

          SetCell<double>(variableName, index, interpolated);
        }
      }
    }

    /// <summary>Replaces the given rows with the most common value reported by the statistics object for the column.</summary>
    /// <exception cref="ArgumentException">The column has none of the three supported types.</exception>
    public void ReplaceIndicesByMostCommonValue(string variableName, IEnumerable<int> indices) {
      if (IsType<double>(variableName)) {
        ReplaceIndicesByValue<double>(variableName, indices, statisticInfo.GetMostCommonValue<double>(variableName));
      } else if (IsType<string>(variableName)) {
        ReplaceIndicesByValue<string>(variableName, indices, statisticInfo.GetMostCommonValue<string>(variableName));
      } else if (IsType<DateTime>(variableName)) {
        ReplaceIndicesByValue<DateTime>(variableName, indices, statisticInfo.GetMostCommonValue<DateTime>(variableName));
      } else {
        throw new ArgumentException("column with variableName: " + variableName + " contains a non supported type.");
      }
    }

    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.