Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/3.2/Dataset.cs @ 2498

Last change on this file since 2498 was 2375, checked in by gkronber, 15 years ago

Refactored CEDMA dispatcher and CEDMA server view to allow different modeling scenarios for each variable. #754

File size: 18.5 KB
Line 
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Xml;
using HeuristicLab.Core;
using HeuristicLab.Data;
30
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// A table of double values with named columns (variables). The values are kept in a
  /// single flat array in row-major order (index = columns * row + column), together with
  /// a scaling factor and offset per column and lazily filled caches for means and ranges.
  /// </summary>
  public sealed class Dataset : ItemBase {
    // One cache per column: cache[column][start][end] holds the statistic of rows [start, end).
    private Dictionary<int, Dictionary<int, double>>[] cachedMeans;
    private Dictionary<int, Dictionary<int, double>>[] cachedRanges;
    // Set whenever the sample values (or their layout) change; the caches are rebuilt
    // on the next GetMean/GetRange call.
    private bool cachedValuesInvalidated = true;

    /// <summary>Creates a dataset with one row and one column that holds the value 0.0.</summary>
    public Dataset()
      : this(new double[,] { { 0.0 } }) {
    }

    /// <summary>
    /// Creates a dataset from a two-dimensional array; the first dimension is interpreted
    /// as rows and the second as columns.
    /// </summary>
    /// <param name="samples">The values to copy into the dataset.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="samples"/> is null.</exception>
    public Dataset(double[,] samples) {
      if (samples == null) throw new ArgumentNullException("samples");
      Name = "-";
      Rows = samples.GetLength(0);
      Columns = samples.GetLength(1);
      double[] values = new double[Rows * Columns];
      int next = 0;
      for (int row = 0; row < Rows; row++) {
        for (int column = 0; column < Columns; column++) {
          values[next++] = samples[row, column];
        }
      }
      // The setter also creates the default variable names and identity scaling.
      Samples = values;
      fireChangeEvents = true;
    }

    #region Properties
    private string name;
    /// <summary>Gets or sets the name of the dataset.</summary>
    public string Name {
      get { return name; }
      set { name = value; }
    }

    private int rows;
    /// <summary>Gets or sets the number of rows. Setting it does not resize the sample array.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>
    /// Gets or sets the number of columns. When the count no longer matches the variable
    /// name array, the names are replaced by an array of unnamed (null) entries.
    /// Setting it does not resize the sample array.
    /// </summary>
    public int Columns {
      get { return columns; }
      set {
        columns = value;
        if (variableNames == null || variableNames.Length != columns) {
          variableNames = new string[columns];
        }
        // The column count determines both the flat-array layout and the size of the
        // per-column caches, so previously cached statistics can no longer be trusted.
        cachedValuesInvalidated = true;
      }
    }

    private string[] variableNames;
    /// <summary>Gets the variable names in column order.</summary>
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
    }

    private double[] samples;
    /// <summary>
    /// Gets or sets the flat, row-major sample array. Assigning a new array resets the
    /// variable names to "Var1", "Var2", ..., resets the scaling of every column to
    /// factor 1.0 and offset 0.0, invalidates the cached statistics and, when change
    /// events are enabled, calls FireChanged().
    /// </summary>
    public double[] Samples {
      get { return samples; }
      set {
        variableNames = Enumerable.Range(1, columns).Select(x => "Var" + x.ToString("###")).ToArray();
        scalingFactor = new double[columns];
        scalingOffset = new double[columns];
        for (int i = 0; i < scalingFactor.Length; i++) {
          scalingFactor[i] = 1.0;
          scalingOffset[i] = 0.0;
        }
        samples = value;
        cachedValuesInvalidated = true;
        if (fireChangeEvents) FireChanged();
      }
    }

    private bool fireChangeEvents = true;
    /// <summary>Gets or sets whether modifications of the dataset call FireChanged().</summary>
    public bool FireChangeEvents {
      get { return fireChangeEvents; }
      set { fireChangeEvents = value; }
    }

    private double[] scalingFactor;
    /// <summary>Gets or sets the scaling factor of each column.</summary>
    /// <exception cref="ArgumentNullException">Thrown when the assigned array is null.</exception>
    /// <exception cref="ArgumentException">Thrown when the assigned array has a different length than the current one.</exception>
    public double[] ScalingFactor {
      get { return scalingFactor; }
      set {
        if (value == null) throw new ArgumentNullException("value");
        if (value.Length != scalingFactor.Length)
          throw new ArgumentException("Length of scaling factor array doesn't match number of variables");
        scalingFactor = value;
      }
    }

    private double[] scalingOffset;
    /// <summary>Gets or sets the scaling offset of each column.</summary>
    /// <exception cref="ArgumentNullException">Thrown when the assigned array is null.</exception>
    /// <exception cref="ArgumentException">Thrown when the assigned array has a different length than the current one.</exception>
    public double[] ScalingOffset {
      get { return scalingOffset; }
      set {
        if (value == null) throw new ArgumentNullException("value");
        if (value.Length != scalingOffset.Length)
          throw new ArgumentException("Length of scaling offset array doesn't match number of variables");
        scalingOffset = value;
      }
    }
    #endregion

    #region Modify and get values
    /// <summary>Returns the value stored at the given row and column.</summary>
    public double GetValue(int row, int column) {
      return samples[columns * row + column];
    }

    /// <summary>Returns a copy of the values of one column for the rows [start, end).</summary>
    /// <param name="variableIndex">The column index.</param>
    /// <param name="start">The first row (inclusive).</param>
    /// <param name="end">The last row (exclusive).</param>
    /// <exception cref="ArgumentException">Thrown when start is negative or larger than end, or when end is larger than the number of rows.</exception>
    public double[] GetVariableValues(int variableIndex, int start, int end) {
      if (start < 0 || start > end)
        throw new ArgumentException("Start must be between 0 and end (" + end + ").");
      if (end > rows || end < start)
        throw new ArgumentException("End must be between start (" + start + ") and dataset rows (" + rows + ").");

      int count = end - start;
      double[] values = new double[count];
      for (int i = 0; i < count; i++)
        values[i] = GetValue(start + i, variableIndex);
      return values;
    }

    /// <summary>Returns a copy of all values of one column.</summary>
    public double[] GetVariableValues(int variableIndex) {
      return GetVariableValues(variableIndex, 0, this.rows);
    }

    /// <summary>Returns a copy of the values of the named column for the rows [start, end).</summary>
    public double[] GetVariableValues(string variableName, int start, int end) {
      return GetVariableValues(GetVariableIndex(variableName), start, end);
    }

    /// <summary>Returns a copy of all values of the named column.</summary>
    public double[] GetVariableValues(string variableName) {
      return GetVariableValues(variableName, 0, this.rows);
    }

    /// <summary>
    /// Stores <paramref name="v"/> at row <paramref name="i"/> and column <paramref name="j"/>.
    /// When the stored value actually changes, the cached statistics are invalidated and,
    /// if change events are enabled, FireChanged() is called.
    /// </summary>
    public void SetValue(int i, int j, double v) {
      int index = columns * i + j;
      // Double.Equals treats NaN as equal to NaN (unlike the != operator), so overwriting
      // a missing value with a missing value is not reported as a change.
      if (!v.Equals(samples[index])) {
        samples[index] = v;
        cachedValuesInvalidated = true;
        if (fireChangeEvents) FireChanged();
      }
    }

    /// <summary>
    /// Replaces the values of one column in the rows [start, end) and returns the values
    /// that were stored there before. Only one change notification is sent for the whole
    /// replacement, and only when change events were enabled on entry.
    /// </summary>
    /// <param name="variableIndex">The column index.</param>
    /// <param name="newValues">The replacement values; must contain exactly end - start elements.</param>
    /// <param name="start">The first row (inclusive).</param>
    /// <param name="end">The last row (exclusive).</param>
    /// <exception cref="ArgumentException">Thrown when the number of new values differs from end - start.</exception>
    public IEnumerable<double> ReplaceVariableValues(int variableIndex, IEnumerable<double> newValues, int start, int end) {
      int count = end - start;
      double[] oldValues = new double[count];
      for (int i = 0; i < count; i++) oldValues[i] = this.GetValue(i + start, variableIndex);

      // Materialize once so that a lazily evaluated sequence is not enumerated twice.
      double[] replacements = newValues.ToArray();
      if (replacements.Length != count) throw new ArgumentException("The length of the new values sequence doesn't match the required length (number of replaced values)");

      // Suppress the per-value notifications and restore the caller's setting afterwards,
      // even when writing a value fails.
      bool eventsWereEnabled = fireChangeEvents;
      fireChangeEvents = false;
      try {
        for (int i = 0; i < replacements.Length; i++) {
          this.SetValue(start + i, variableIndex, replacements[i]);
        }
      } finally {
        fireChangeEvents = eventsWereEnabled;
      }
      if (eventsWereEnabled) this.FireChanged();
      return oldValues;
    }

    /// <summary>Replaces the values of the named column in the rows [start, end) and returns the previous values.</summary>
    public IEnumerable<double> ReplaceVariableValues(string variableName, IEnumerable<double> newValues, int start, int end) {
      return ReplaceVariableValues(this.GetVariableIndex(variableName), newValues, start, end);
    }
    #endregion

    #region Variable name methods
    /// <summary>Returns the name of the column with the given index.</summary>
    public string GetVariableName(int variableIndex) {
      return variableNames[variableIndex];
    }

    /// <summary>Returns the index of the first column with the given name.</summary>
    /// <exception cref="ArgumentException">Thrown when no column has the given name.</exception>
    public int GetVariableIndex(string variableName) {
      for (int i = 0; i < variableNames.Length; i++) {
        // The static comparison tolerates unnamed (null) columns.
        if (string.Equals(variableNames[i], variableName)) return i;
      }
      throw new ArgumentException("The variable name " + variableName + " was not found.");
    }

    /// <summary>Renames a column and, if change events are enabled, calls FireChanged().</summary>
    public void SetVariableName(int variableIndex, string name) {
      variableNames[variableIndex] = name;
      if (fireChangeEvents) FireChanged();
    }

    /// <summary>Returns whether a column with the given name exists.</summary>
    public bool ContainsVariableName(string variableName) {
      return this.variableNames.Contains(variableName);
    }
    #endregion

    /// <summary>Creates a DatasetView for this dataset.</summary>
    public override IView CreateView() {
      return new DatasetView(this);
    }


    #region Variable statistics
    /// <summary>Returns the mean of all values of the named column.</summary>
    public double GetMean(string variableName) {
      return GetMean(GetVariableIndex(variableName));
    }

    /// <summary>Returns the mean of the named column over the rows [start, end).</summary>
    public double GetMean(string variableName, int start, int end) {
      return GetMean(GetVariableIndex(variableName), start, end);
    }

    /// <summary>Returns the mean of all values of a column.</summary>
    public double GetMean(int column) {
      return GetMean(column, 0, Rows);
    }

    /// <summary>
    /// Returns the mean of a column over the rows [start, end) as computed by
    /// Statistics.Mean. Results are cached per column and row range until the
    /// sample values change.
    /// </summary>
    public double GetMean(int column, int start, int end) {
      if (cachedValuesInvalidated) CreateDictionaries();
      double mean;
      if (!TryGetCachedValue(cachedMeans[column], start, end, out mean)) {
        mean = Statistics.Mean(CopyColumnRows(column, start, end));
        StoreCachedValue(cachedMeans[column], start, end, mean);
      }
      return mean;
    }

    /// <summary>Returns the range of all values of the named column.</summary>
    public double GetRange(string variableName) {
      return GetRange(this.GetVariableIndex(variableName));
    }

    /// <summary>Returns the range of all values of a column.</summary>
    public double GetRange(int column) {
      return GetRange(column, 0, Rows);
    }

    /// <summary>Returns the range of the named column over the rows [start, end).</summary>
    public double GetRange(string variableName, int start, int end) {
      return GetRange(this.GetVariableIndex(variableName), start, end);
    }

    /// <summary>
    /// Returns the range of a column over the rows [start, end) as computed by
    /// Statistics.Range. Results are cached per column and row range until the
    /// sample values change.
    /// </summary>
    public double GetRange(int column, int start, int end) {
      if (cachedValuesInvalidated) CreateDictionaries();
      double range;
      if (!TryGetCachedValue(cachedRanges[column], start, end, out range)) {
        range = Statistics.Range(CopyColumnRows(column, start, end));
        StoreCachedValue(cachedRanges[column], start, end, range);
      }
      return range;
    }

    /// <summary>Returns the largest non-NaN value of the named column.</summary>
    public double GetMaximum(string variableName) {
      return GetMaximum(this.GetVariableIndex(variableName));
    }

    /// <summary>Returns the largest non-NaN value of a column.</summary>
    public double GetMaximum(int column) {
      return GetMaximum(column, 0, Rows);
    }

    /// <summary>Returns the largest non-NaN value of the named column in the rows [start, end).</summary>
    public double GetMaximum(string variableName, int start, int end) {
      return GetMaximum(this.GetVariableIndex(variableName), start, end);
    }

    /// <summary>
    /// Returns the largest non-NaN value of a column in the rows [start, end), or
    /// negative infinity when the range contains no such value.
    /// </summary>
    public double GetMaximum(int column, int start, int end) {
      double max = Double.NegativeInfinity;
      for (int row = start; row < end; row++) {
        double val = GetValue(row, column);
        if (!double.IsNaN(val) && val > max) max = val;
      }
      return max;
    }

    /// <summary>Returns the smallest non-NaN value of the named column.</summary>
    public double GetMinimum(string variableName) {
      return GetMinimum(GetVariableIndex(variableName));
    }

    /// <summary>Returns the smallest non-NaN value of a column.</summary>
    public double GetMinimum(int column) {
      return GetMinimum(column, 0, Rows);
    }

    /// <summary>Returns the smallest non-NaN value of the named column in the rows [start, end).</summary>
    public double GetMinimum(string variableName, int start, int end) {
      return GetMinimum(this.GetVariableIndex(variableName), start, end);
    }

    /// <summary>
    /// Returns the smallest non-NaN value of a column in the rows [start, end), or
    /// positive infinity when the range contains no such value.
    /// </summary>
    public double GetMinimum(int column, int start, int end) {
      double min = Double.PositiveInfinity;
      for (int row = start; row < end; row++) {
        double val = GetValue(row, column);
        if (!double.IsNaN(val) && val < min) min = val;
      }
      return min;
    }

    /// <summary>Returns the number of NaN values in the named column.</summary>
    public int CountMissingValues(string variableName) {
      return CountMissingValues(this.GetVariableIndex(variableName));
    }

    /// <summary>Returns the number of NaN values in a column.</summary>
    public int CountMissingValues(int column) {
      return CountMissingValues(column, 0, Rows);
    }

    /// <summary>Returns the number of NaN values of the named column in the rows [start, end).</summary>
    public int CountMissingValues(string variableName, int start, int end) {
      return CountMissingValues(this.GetVariableIndex(variableName), start, end);
    }

    /// <summary>Returns the number of NaN values of a column in the rows [start, end).</summary>
    public int CountMissingValues(int column, int start, int end) {
      int missing = 0;
      for (int row = start; row < end; row++) {
        if (double.IsNaN(GetValue(row, column))) missing++;
      }
      return missing;
    }

    // Copies the values of one column for the rows [start, end) without range validation.
    private double[] CopyColumnRows(int column, int start, int end) {
      double[] values = new double[end - start];
      for (int row = start; row < end; row++) {
        values[row - start] = GetValue(row, column);
      }
      return values;
    }

    // Looks up the statistic cached for the rows [start, end); returns false when there is none.
    private static bool TryGetCachedValue(Dictionary<int, Dictionary<int, double>> cache, int start, int end, out double value) {
      Dictionary<int, double> byEnd;
      if (cache.TryGetValue(start, out byEnd)) return byEnd.TryGetValue(end, out value);
      value = 0.0;
      return false;
    }

    // Remembers the statistic for the rows [start, end).
    private static void StoreCachedValue(Dictionary<int, Dictionary<int, double>> cache, int start, int end, double value) {
      Dictionary<int, double> byEnd;
      if (!cache.TryGetValue(start, out byEnd)) {
        byEnd = new Dictionary<int, double>();
        cache[start] = byEnd;
      }
      byEnd[end] = value;
    }

    #endregion

    /// <summary>
    /// Scales a column that is currently unscaled (factor 1.0, offset 0.0) with
    /// offset -minimum and factor 1 / (maximum - minimum); when maximum equals minimum
    /// the factor stays 1.0. A column that already carries scaling information is left
    /// untouched. The caches are invalidated and a change is signalled in either case.
    /// </summary>
    internal void ScaleVariable(int column) {
      bool isUnscaled = scalingFactor[column] == 1.0 && scalingOffset[column] == 0.0;
      if (isUnscaled) {
        double min = GetMinimum(column);
        double max = GetMaximum(column);
        double range = max - min;
        double factor = range == 0 ? 1.0 : 1.0 / range;
        ScaleVariable(column, factor, -min);
      }
      cachedValuesInvalidated = true;
      if (fireChangeEvents) FireChanged();
    }

    /// <summary>
    /// Records the given factor and offset for a column and replaces every value x of
    /// that column by (x + offset) * factor.
    /// </summary>
    internal void ScaleVariable(int column, double factor, double offset) {
      scalingFactor[column] = factor;
      scalingOffset[column] = offset;
      for (int row = 0; row < Rows; row++) {
        int index = row * columns + column;
        samples[index] = (samples[index] + offset) * factor;
      }
      cachedValuesInvalidated = true;
      if (fireChangeEvents) FireChanged();
    }

    /// <summary>
    /// Reverts the recorded scaling of a column (x / factor - offset) and resets its
    /// factor to 1.0 and its offset to 0.0. The caches are invalidated and a change is
    /// signalled even when the column was not scaled.
    /// </summary>
    internal void UnscaleVariable(int column) {
      double factor = scalingFactor[column];
      double offset = scalingOffset[column];
      if (factor != 1.0 || offset != 0.0) {
        for (int row = 0; row < rows; row++) {
          int index = row * columns + column;
          samples[index] = samples[index] / factor - offset;
        }
        scalingFactor[column] = 1.0;
        scalingOffset[column] = 0.0;
      }
      cachedValuesInvalidated = true;
      if (fireChangeEvents) FireChanged();
    }

    // Creates empty mean and range caches for every column and marks the caches as valid.
    private void CreateDictionaries() {
      // keep a means and ranges dictionary for each column (possible target variable) of the dataset.
      cachedMeans = new Dictionary<int, Dictionary<int, double>>[columns];
      cachedRanges = new Dictionary<int, Dictionary<int, double>>[columns];
      for (int column = 0; column < columns; column++) {
        cachedMeans[column] = new Dictionary<int, Dictionary<int, double>>();
        cachedRanges[column] = new Dictionary<int, Dictionary<int, double>>();
      }
      cachedValuesInvalidated = false;
    }

    #region persistence
    /// <summary>
    /// Creates a copy of this dataset with its own sample, variable name and scaling
    /// arrays and registers it in <paramref name="clonedObjects"/> under this object's Guid.
    /// </summary>
    public override object Clone(IDictionary<Guid, object> clonedObjects) {
      Dataset clone = new Dataset();
      clonedObjects.Add(Guid, clone);
      double[] cloneSamples = new double[rows * columns];
      Array.Copy(samples, cloneSamples, samples.Length);
      // Set the dimensions first: the Samples setter sizes the name and scaling arrays by the column count.
      clone.rows = rows;
      clone.columns = columns;
      clone.Samples = cloneSamples;
      clone.Name = Name;
      clone.variableNames = new string[variableNames.Length];
      Array.Copy(variableNames, clone.variableNames, variableNames.Length);
      Array.Copy(scalingFactor, clone.scalingFactor, columns);
      Array.Copy(scalingOffset, clone.scalingOffset, columns);
      return clone;
    }

    /// <summary>
    /// Serializes the dataset: name, dimensions, variable names and scaling information
    /// are written as attributes, the values as ';'-separated inner text using the
    /// invariant culture.
    /// </summary>
    public override XmlNode GetXmlNode(string name, XmlDocument document, IDictionary<Guid, IStorable> persistedObjects) {
      XmlNode node = base.GetXmlNode(name, document, persistedObjects);
      XmlAttribute problemName = document.CreateAttribute("Name");
      problemName.Value = Name;
      node.Attributes.Append(problemName);
      XmlAttribute dim1 = document.CreateAttribute("Dimension1");
      dim1.Value = rows.ToString(CultureInfo.InvariantCulture.NumberFormat);
      node.Attributes.Append(dim1);
      XmlAttribute dim2 = document.CreateAttribute("Dimension2");
      dim2.Value = columns.ToString(CultureInfo.InvariantCulture.NumberFormat);
      node.Attributes.Append(dim2);
      XmlAttribute variableNamesAttribute = document.CreateAttribute("VariableNames");
      variableNamesAttribute.Value = GetVariableNamesString();
      node.Attributes.Append(variableNamesAttribute);
      XmlAttribute scalingFactorsAttribute = document.CreateAttribute("ScalingFactors");
      scalingFactorsAttribute.Value = GetString(scalingFactor);
      node.Attributes.Append(scalingFactorsAttribute);
      XmlAttribute scalingOffsetsAttribute = document.CreateAttribute("ScalingOffsets");
      scalingOffsetsAttribute.Value = GetString(scalingOffset);
      node.Attributes.Append(scalingOffsetsAttribute);
      node.InnerText = ToString(CultureInfo.InvariantCulture.NumberFormat);
      return node;
    }

    /// <summary>
    /// Restores the dataset from a node written by GetXmlNode. Missing scaling
    /// attributes are replaced by factor 1.0 and offset 0.0 for every column.
    /// </summary>
    /// <exception cref="FormatException">Thrown when the number of values does not match the stored dimensions or a value cannot be parsed.</exception>
    public override void Populate(XmlNode node, IDictionary<Guid, IStorable> restoredObjects) {
      base.Populate(node, restoredObjects);
      Name = node.Attributes["Name"].Value;
      rows = int.Parse(node.Attributes["Dimension1"].Value, CultureInfo.InvariantCulture.NumberFormat);
      columns = int.Parse(node.Attributes["Dimension2"].Value, CultureInfo.InvariantCulture.NumberFormat);

      variableNames = ParseVariableNamesString(node.Attributes["VariableNames"].Value);
      if (node.Attributes["ScalingFactors"] != null)
        scalingFactor = ParseDoubleString(node.Attributes["ScalingFactors"].Value);
      else {
        scalingFactor = new double[columns]; // compatibility with old serialization format
        for (int i = 0; i < scalingFactor.Length; i++) scalingFactor[i] = 1.0;
      }
      if (node.Attributes["ScalingOffsets"] != null)
        scalingOffset = ParseDoubleString(node.Attributes["ScalingOffsets"].Value);
      else {
        scalingOffset = new double[columns]; // compatibility with old serialization format
        for (int i = 0; i < scalingOffset.Length; i++) scalingOffset[i] = 0.0;
      }

      int expectedCount = rows * columns;
      string text = node.InnerText;
      // A dataset without values is written as an empty string, which Split would turn
      // into a single empty token.
      string[] tokens = (expectedCount == 0 && text.Trim().Length == 0) ? new string[0] : text.Split(';');
      if (tokens.Length != expectedCount) throw new FormatException();
      samples = new double[expectedCount];
      for (int i = 0; i < tokens.Length; i++) {
        if (!double.TryParse(tokens[i], NumberStyles.Float, CultureInfo.InvariantCulture.NumberFormat, out samples[i])) {
          throw new FormatException("Can't parse " + tokens[i] + " as double value.");
        }
      }
      // The sample array was replaced without going through the Samples setter, so
      // statistics cached for the previous content must be discarded explicitly.
      cachedValuesInvalidated = true;
    }

    /// <summary>Returns all values in row-major order, separated by ';' and formatted with the current culture.</summary>
    public override string ToString() {
      return ToString(CultureInfo.CurrentCulture.NumberFormat);
    }

    // Writes all values in row-major order, separated by ';', using the round-trip format "r".
    private string ToString(NumberFormatInfo format) {
      StringBuilder builder = new StringBuilder();
      int count = rows * columns;
      for (int i = 0; i < count; i++) {
        if (i > 0) builder.Append(";");
        builder.Append(samples[i].ToString("r", format));
      }
      return builder.ToString();
    }

    // Joins the variable names with "; "; trailing separators and blanks are removed.
    private string GetVariableNamesString() {
      return string.Join("; ", variableNames).TrimEnd(';', ' ');
    }

    // Joins the values with "; " using the round-trip format and the invariant culture.
    private string GetString(double[] xs) {
      string[] parts = xs.Select(x => x.ToString("r", CultureInfo.InvariantCulture)).ToArray();
      return string.Join("; ", parts);
    }

    // Splits a ';'-separated list of names, dropping empty entries and surrounding whitespace.
    private string[] ParseVariableNamesString(string p) {
      string[] tokens = p.Trim().Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
      for (int i = 0; i < tokens.Length; i++) tokens[i] = tokens[i].Trim();
      return tokens;
    }

    // Splits a ';'-separated list of numbers and parses each entry with the invariant culture.
    private double[] ParseDoubleString(string s) {
      string[] tokens = s.Trim().Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
      double[] xs = new double[tokens.Length];
      for (int i = 0; i < xs.Length; i++) {
        xs[i] = double.Parse(tokens[i], CultureInfo.InvariantCulture);
      }
      return xs;
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.