Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/Dataset.cs @ 3264

Last change on this file since 3264 was 3264, checked in by gkronber, 14 years ago

Implemented import of CSV files for regression problems. #938 (Data types and operators for regression problems)

File size: 12.1 KB
Line 
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Xml;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// A table of double values with one named variable per column.
  /// </summary>
  /// <remarks>
  /// Through <see cref="IStringConvertibleMatrix"/> the dataset is exposed with one extra
  /// leading row (row 0) that holds the variable names; data row <c>r</c> is matrix row <c>r + 1</c>.
  /// Means and ranges are cached per (column, start, end) and the cache is dropped whenever
  /// a data-changed or reset notification is processed.
  /// </remarks>
  [Item("Dataset", "Represents a dataset containing data that should be analyzed.")]
  [StorableClass]
  public sealed class Dataset : NamedItem, IStringConvertibleMatrix {
    // One cache per column: start row -> (end row -> cached statistic).
    private Dictionary<int, Dictionary<int, double>>[] cachedMeans;
    private Dictionary<int, Dictionary<int, double>>[] cachedRanges;
    // True until CreateDictionaries() has rebuilt (emptied) the caches.
    private bool cachedValuesInvalidated = true;

    /// <summary>
    /// Creates a 1x1 dataset with a single variable "x" whose only value is 0.0.
    /// </summary>
    public Dataset()
      : this(new string[] { "x" }, new double[,] { { 0.0 } }) {
    }

    /// <summary>
    /// Creates a dataset from the given variable names and values.
    /// </summary>
    /// <param name="variableNames">One name per column of <paramref name="data"/>.</param>
    /// <param name="data">Values, indexed as [row, column].</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="ArgumentException">The number of names differs from the number of columns.</exception>
    public Dataset(IEnumerable<string> variableNames, double[,] data)
      : base() {
      if (variableNames == null) throw new ArgumentNullException("variableNames");
      if (data == null) throw new ArgumentNullException("data");
      Name = "-";
      // Materialize once: the sequence may be lazy and must not be enumerated twice.
      string[] names = variableNames.ToArray();
      if (names.Length != data.GetLength(1)) {
        throw new ArgumentException("Number of variable names doesn't match the number of columns of data");
      }
      Data = new DoubleMatrix(data);
      this.variableNames = new StringArray(names);
    }

    // NOTE(review): the class is marked [StorableClass] but neither field below carries a
    // [Storable] attribute — confirm how persistence of names and values is meant to work.
    private StringArray variableNames;
    /// <summary>The variable (column) names in column order.</summary>
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    private DoubleMatrix data;
    /// <summary>
    /// The backing matrix. Assigning a different instance moves the event subscriptions
    /// from the old matrix to the new one and raises <see cref="Reset"/>.
    /// </summary>
    private DoubleMatrix Data {
      get { return data; }
      set {
        if (data != value) {
          if (value == null) throw new ArgumentNullException();
          if (data != null) DeregisterDataEvents();
          this.data = value;
          RegisterDataEvents();
          OnReset(EventArgs.Empty);
        }
      }
    }

    private void RegisterDataEvents() {
      data.Reset += new EventHandler(data_Reset);
      data.ItemChanged += new EventHandler<EventArgs<int, int>>(data_ItemChanged);
    }

    private void DeregisterDataEvents() {
      data.Reset -= new EventHandler(data_Reset);
      data.ItemChanged -= new EventHandler<EventArgs<int, int>>(data_ItemChanged);
    }

    /// <summary>
    /// Element-wise access to the data values (row 0 is the first data row here).
    /// Setting a value that differs from the stored one raises <see cref="DataChanged"/>.
    /// </summary>
    public double this[int rowIndex, int columnIndex] {
      get { return data[rowIndex, columnIndex]; }
      set {
        // Equals (not ==) so that replacing NaN with NaN is treated as "no change".
        if (!value.Equals(data[rowIndex, columnIndex])) {
          data[rowIndex, columnIndex] = value;
          // NOTE(review): if the matrix raises ItemChanged for the assignment above,
          // data_ItemChanged has already called OnDataChanged and listeners are
          // notified twice — confirm against DoubleMatrix.
          OnDataChanged(new EventArgs<int, int>(rowIndex, columnIndex));
        }
      }
    }

    /// <summary>Returns a copy of all values of the named variable.</summary>
    public double[] this[string variableName] {
      get { return VariableValues(VariableIndex(variableName), 0, data.Rows); }
    }

    /// <summary>
    /// Copies the values of one column for the rows in [<paramref name="start"/>, <paramref name="end"/>).
    /// </summary>
    /// <param name="variableIndex">Column index.</param>
    /// <param name="start">First row (inclusive); must satisfy 0 &lt;= start &lt;= end.</param>
    /// <param name="end">Last row (exclusive); must not exceed the number of data rows.</param>
    /// <returns>A new array of length <c>end - start</c>.</returns>
    /// <exception cref="ArgumentException">The row range is invalid.</exception>
    public double[] VariableValues(int variableIndex, int start, int end) {
      if (start < 0 || start > end)
        throw new ArgumentException("Start must be between 0 and end (" + end + ").");
      if (end > data.Rows)
        throw new ArgumentException("End must be between start (" + start + ") and dataset rows (" + data.Rows + ").");

      int length = end - start;
      double[] values = new double[length];
      for (int offset = 0; offset < length; offset++)
        values[offset] = data[start + offset, variableIndex];
      return values;
    }

    /// <summary>Same as <see cref="VariableValues(int, int, int)"/>, addressing the column by name.</summary>
    public double[] VariableValues(string variableName, int start, int end) {
      return VariableValues(VariableIndex(variableName), start, end);
    }

    #region Variable name methods
    /// <summary>Returns the name of the variable stored in the given column.</summary>
    public string VariableName(int variableIndex) {
      return variableNames[variableIndex];
    }

    /// <summary>Returns the index of the first column whose name equals <paramref name="variableName"/>.</summary>
    /// <exception cref="ArgumentException">No column has that name.</exception>
    public int VariableIndex(string variableName) {
      for (int i = 0; i < variableNames.Length; i++) {
        if (variableNames[i].Equals(variableName)) return i;
      }
      throw new ArgumentException("The variable name " + variableName + " was not found.");
    }

    /// <summary>Renames the variable stored in the given column.</summary>
    public void SetVariableName(int variableIndex, string name) {
      variableNames[variableIndex] = name;
    }

    #endregion

    #region variable statistics
    /// <summary>Mean of all values of the named variable.</summary>
    public double Mean(string variableName) {
      return Mean(VariableIndex(variableName));
    }

    /// <summary>Mean of the named variable over rows [start, end).</summary>
    public double Mean(string variableName, int start, int end) {
      return Mean(VariableIndex(variableName), start, end);
    }

    /// <summary>Mean of all values of the given column.</summary>
    public double Mean(int variableIndex) {
      return Mean(variableIndex, 0, data.Rows);
    }

    /// <summary>
    /// Mean of the given column over rows [start, end). The result is cached until the
    /// cache is invalidated by a data-changed or reset notification.
    /// </summary>
    public double Mean(int variableIndex, int start, int end) {
      if (cachedValuesInvalidated) CreateDictionaries();
      return GetOrAddCachedValue(cachedMeans[variableIndex], start, end,
        () => VariableValues(variableIndex, start, end).Average());
    }

    /// <summary>Range (max - min) of all values of the named variable.</summary>
    public double Range(string variableName) {
      return Range(VariableIndex(variableName));
    }

    /// <summary>Range (max - min) of all values of the given column.</summary>
    public double Range(int variableIndex) {
      return Range(variableIndex, 0, data.Rows);
    }

    /// <summary>Range (max - min) of the named variable over rows [start, end).</summary>
    public double Range(string variableName, int start, int end) {
      return Range(VariableIndex(variableName), start, end);
    }

    /// <summary>
    /// Range (max - min) of the given column over rows [start, end). The result is cached
    /// until the cache is invalidated by a data-changed or reset notification.
    /// </summary>
    public double Range(int variableIndex, int start, int end) {
      if (cachedValuesInvalidated) CreateDictionaries();
      return GetOrAddCachedValue(cachedRanges[variableIndex], start, end, () => {
        double[] values = VariableValues(variableIndex, start, end);
        return values.Max() - values.Min();
      });
    }

    // Looks up cache[start][end]; on a miss evaluates compute(), stores the result and returns it.
    // compute() runs before the cache is touched, so an exception leaves the cache unchanged.
    private static double GetOrAddCachedValue(Dictionary<int, Dictionary<int, double>> cache, int start, int end, Func<double> compute) {
      Dictionary<int, double> byEnd;
      double cached;
      if (cache.TryGetValue(start, out byEnd) && byEnd.TryGetValue(end, out cached)) return cached;

      double result = compute();
      if (byEnd == null) {
        byEnd = new Dictionary<int, double>();
        cache[start] = byEnd;
      }
      byEnd[end] = result;
      return result;
    }

    /// <summary>Largest value of the named variable.</summary>
    public double Max(string variableName) {
      return Max(VariableIndex(variableName));
    }

    /// <summary>Largest value of the given column.</summary>
    public double Max(int variableIndex) {
      return Max(variableIndex, 0, data.Rows);
    }

    /// <summary>Largest value of the named variable over rows [start, end).</summary>
    public double Max(string variableName, int start, int end) {
      return Max(VariableIndex(variableName), start, end);
    }

    /// <summary>Largest value of the given column over rows [start, end). Not cached.</summary>
    public double Max(int variableIndex, int start, int end) {
      return VariableValues(variableIndex, start, end).Max();
    }

    /// <summary>Smallest value of the named variable.</summary>
    public double Min(string variableName) {
      return Min(VariableIndex(variableName));
    }

    /// <summary>Smallest value of the given column.</summary>
    public double Min(int variableIndex) {
      return Min(variableIndex, 0, data.Rows);
    }

    /// <summary>Smallest value of the named variable over rows [start, end).</summary>
    public double Min(string variableName, int start, int end) {
      return Min(VariableIndex(variableName), start, end);
    }

    /// <summary>Smallest value of the given column over rows [start, end). Not cached.</summary>
    public double Min(int variableIndex, int start, int end) {
      return VariableValues(variableIndex, start, end).Min();
    }

    /// <summary>Number of NaN entries of the named variable.</summary>
    public int MissingValues(string variableName) {
      return MissingValues(VariableIndex(variableName));
    }

    /// <summary>Number of NaN entries of the given column.</summary>
    public int MissingValues(int variableIndex) {
      return MissingValues(variableIndex, 0, data.Rows);
    }

    /// <summary>Number of NaN entries of the named variable over rows [start, end).</summary>
    public int MissingValues(string variableName, int start, int end) {
      return MissingValues(VariableIndex(variableName), start, end);
    }

    /// <summary>Number of NaN entries of the given column over rows [start, end).</summary>
    public int MissingValues(int variableIndex, int start, int end) {
      return VariableValues(variableIndex, start, end).Count(x => double.IsNaN(x));
    }

    #endregion

    // Replaces both caches with fresh, empty dictionaries (one per column) and
    // clears the invalidation flag.
    private void CreateDictionaries() {
      int columns = data.Columns;
      cachedMeans = new Dictionary<int, Dictionary<int, double>>[columns];
      cachedRanges = new Dictionary<int, Dictionary<int, double>>[columns];
      for (int i = 0; i < columns; i++) {
        cachedMeans[i] = new Dictionary<int, Dictionary<int, double>>();
        cachedRanges[i] = new Dictionary<int, Dictionary<int, double>>();
      }
      cachedValuesInvalidated = false;
    }

    /// <summary>
    /// Creates a deep copy with its own variable-name array and data matrix.
    /// </summary>
    public override IDeepCloneable Clone(Cloner cloner) {
      Dataset clone = (Dataset)base.Clone(cloner);
      clone.variableNames = (StringArray)variableNames.Clone(cloner);
      // Assign through the Data property rather than the field: the setter subscribes the
      // clone to its own matrix's Reset/ItemChanged events and invalidates its cache, so
      // later edits of the clone (e.g. via SetValue) cannot leave stale statistics behind.
      clone.Data = (DoubleMatrix)data.Clone(cloner);
      return clone;
    }

    #region events
    /// <summary>Raised with (row, column) of the data matrix when a value changed.</summary>
    public event EventHandler<EventArgs<int, int>> DataChanged;
    private void OnDataChanged(EventArgs<int, int> e) {
      cachedValuesInvalidated = true;

      var listeners = DataChanged;
      if (listeners != null) listeners(this, e);
    }

    /// <summary>Raised when the backing matrix was replaced or reports a reset.</summary>
    public event EventHandler Reset;
    private void OnReset(EventArgs e) {
      cachedValuesInvalidated = true;

      var listeners = Reset;
      if (listeners != null) listeners(this, e);
    }

    private void data_ItemChanged(object sender, EventArgs<int, int> e) {
      OnDataChanged(e);
    }

    private void data_Reset(object sender, EventArgs e) {
      OnReset(e);
    }
    #endregion

    #region IStringConvertibleMatrix Members

    /// <summary>
    /// Number of matrix rows including the variable-name row, i.e. data rows + 1.
    /// Setting it resizes the data, keeping the values of the rows that still fit.
    /// </summary>
    /// <exception cref="ArgumentException">The value is smaller than one.</exception>
    public int Rows {
      get {
        return data.Rows + 1;
      }
      set {
        // Row 0 is the name row, so anything below one (including negatives) is invalid.
        if (value < 1) throw new ArgumentException("Number of rows must be at least one (for variable names)");
        int newDataRows = value - 1;
        if (newDataRows != data.Rows) {
          var newValues = new double[newDataRows, data.Columns];
          int keptRows = Math.Min(data.Rows, newDataRows);
          for (int row = 0; row < keptRows; row++) {
            for (int column = 0; column < data.Columns; column++) {
              newValues[row, column] = data[row, column];
            }
          }
          Data = new DoubleMatrix(newValues);
        }
      }
    }

    /// <summary>
    /// Number of variables. Setting it resizes the data, keeping the values and names of the
    /// columns that still fit; added columns are zero-filled and named "Var" followed by the
    /// zero-padded column index.
    /// </summary>
    /// <exception cref="ArgumentException">The value is negative.</exception>
    public int Columns {
      get {
        return data.Columns;
      }
      set {
        if (value < 0) throw new ArgumentException("Number of columns must not be negative");
        int oldColumns = data.Columns;
        if (value != oldColumns) {
          int keptColumns = Math.Min(value, oldColumns);
          var newValues = new double[data.Rows, value];
          for (int row = 0; row < data.Rows; row++) {
            for (int column = 0; column < keptColumns; column++) {
              newValues[row, column] = data[row, column];
            }
          }

          var newVariableNames = new string[value];
          for (int column = 0; column < keptColumns; column++) {
            newVariableNames[column] = variableNames[column];
          }
          if (value > oldColumns) {
            // Only reached for value >= 1, so Log10 is finite here.
            // '0' placeholders pad the index (>= 100 variables => 000) and, unlike '#',
            // still print the index 0.
            string formatString = new string('0', (int)Math.Log10(value) + 1);
            for (int column = oldColumns; column < value; column++) {
              newVariableNames[column] = "Var" + column.ToString(formatString);
            }
          }
          variableNames = new StringArray(newVariableNames);
          Data = new DoubleMatrix(newValues);
        }
      }
    }

    /// <summary>Accepts every string; <paramref name="errorMessage"/> is always empty.</summary>
    public bool Validate(string value, out string errorMessage) {
      errorMessage = string.Empty;
      return true;
    }

    /// <summary>
    /// Returns the variable name for row 0, otherwise the data value of row
    /// <c>rowIndex - 1</c> formatted with <c>ToString()</c>.
    /// </summary>
    public string GetValue(int rowIndex, int columnIndex) {
      if (rowIndex == 0) return variableNames[columnIndex];
      return data[rowIndex - 1, columnIndex].ToString();
    }

    /// <summary>
    /// Row 0 sets the variable name; any other row parses <paramref name="value"/> as a
    /// double and stores it in data row <c>rowIndex - 1</c>.
    /// </summary>
    /// <returns>False if a data value could not be parsed; true otherwise.</returns>
    public bool SetValue(string value, int rowIndex, int columnIndex) {
      if (rowIndex == 0) {
        variableNames[columnIndex] = value;
        return true;
      }
      double parsed;
      if (!double.TryParse(value, out parsed)) return false;
      data[rowIndex - 1, columnIndex] = parsed;
      return true;
    }

    // NOTE(review): this event is declared for the interface but never raised in this class.
    public event EventHandler<EventArgs<int, int>> ItemChanged;

    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.