Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3.1/sources/HeuristicLab.DataAnalysis/Dataset.cs @ 12858

Last change on this file since 12858 was 534, checked in by gkronber, 16 years ago

fixed a tiny problem in the persistence of Datasets

File size: 12.2 KB
Line 
1#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Xml;
25using HeuristicLab.Core;
26using HeuristicLab.Data;
27using System.Globalization;
28using System.Text;
29
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// A named table of double values. The values are kept in one flat array in
  /// row-major order (element (row, column) lives at index <c>columns * row + column</c>).
  /// The dataset additionally keeps one scaling factor and offset per column and
  /// caches the means and ranges that have been requested for row intervals.
  /// </summary>
  public sealed class Dataset : ItemBase {

    private string name;
    private double[] samples;
    private int rows;
    private int columns;
    private string[] variableNames;
    // One cache per column, addressed as cache[column][from][to]. Both caches are
    // replaced by empty ones (see CreateDictionaries) whenever the samples change.
    private Dictionary<int, Dictionary<int, double>>[] cachedMeans;
    private Dictionary<int, Dictionary<int, double>>[] cachedRanges;
    private double[] scalingFactor;
    private double[] scalingOffset;

    /// <summary>Gets or sets the name of the dataset.</summary>
    public string Name {
      get { return name; }
      set { name = value; }
    }

    /// <summary>
    /// Gets or sets the number of rows. The setter only stores the value; it does
    /// not resize the sample array or the caches.
    /// </summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    /// <summary>
    /// Gets or sets the number of columns. The setter only stores the value; the
    /// scaling arrays and caches are sized from it the next time <see cref="Samples"/> is assigned.
    /// </summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    /// <summary>Gets the per-column scaling factors (1.0 means "not scaled").</summary>
    public double[] ScalingFactor {
      get { return scalingFactor; }
    }

    /// <summary>Gets the per-column scaling offsets (0.0 means "not shifted").</summary>
    public double[] ScalingOffset {
      get { return scalingOffset; }
    }

    /// <summary>Gets or sets the names of the variables (columns).</summary>
    public string[] VariableNames {
      get { return variableNames; }
      set { variableNames = value; }
    }

    /// <summary>
    /// Gets or sets the flat, row-major sample array. Assigning a new array resets
    /// the scaling of every column to factor 1.0 / offset 0.0, clears the caches
    /// and calls <c>FireChanged()</c>.
    /// </summary>
    public double[] Samples {
      get { return samples; }
      set {
        scalingFactor = new double[columns];
        scalingOffset = new double[columns];
        for(int i = 0; i < scalingFactor.Length; i++) {
          scalingFactor[i] = 1.0;
          scalingOffset[i] = 0.0;
        }
        samples = value;
        CreateDictionaries();
        FireChanged();
      }
    }

    /// <summary>
    /// Creates a dataset named "-" with a single variable "Var0" and one sample of value 0.
    /// </summary>
    public Dataset() {
      Name = "-";
      VariableNames = new string[] { "Var0" };
      Columns = 1;
      Rows = 1;
      // The Samples setter also creates the scaling arrays (factor 1.0, offset 0.0)
      // and the caches, so nothing else has to be initialized here.
      Samples = new double[1];
    }

    /// <summary>Returns the value stored at row <paramref name="i"/>, column <paramref name="j"/>.</summary>
    public double GetValue(int i, int j) {
      return samples[columns * i + j];
    }

    /// <summary>
    /// Stores <paramref name="v"/> at row <paramref name="i"/>, column <paramref name="j"/>.
    /// When the stored value differs, the caches are cleared and <c>FireChanged()</c> is called.
    /// </summary>
    public void SetValue(int i, int j, double v) {
      int index = columns * i + j;
      if(v != samples[index]) {
        samples[index] = v;
        CreateDictionaries();
        FireChanged();
      }
    }

    /// <summary>
    /// Replaces the mean and range caches by empty ones, one dictionary per column
    /// (every column is a possible target variable).
    /// </summary>
    private void CreateDictionaries() {
      cachedMeans = new Dictionary<int, Dictionary<int, double>>[columns];
      cachedRanges = new Dictionary<int, Dictionary<int, double>>[columns];
      for(int i = 0; i < columns; i++) {
        cachedMeans[i] = new Dictionary<int, Dictionary<int, double>>();
        cachedRanges[i] = new Dictionary<int, Dictionary<int, double>>();
      }
    }

    /// <summary>Creates a <c>DatasetView</c> for this dataset.</summary>
    public override IView CreateView() {
      return new DatasetView(this);
    }

    /// <summary>
    /// Creates a deep copy of this dataset and registers it in
    /// <paramref name="clonedObjects"/> under this object's Guid.
    /// </summary>
    /// <param name="clonedObjects">Dictionary of all objects cloned so far.</param>
    /// <returns>The cloned dataset.</returns>
    public override object Clone(IDictionary<Guid, object> clonedObjects) {
      Dataset clone = new Dataset();
      clonedObjects.Add(Guid, clone);
      clone.rows = rows;
      clone.columns = columns;
      // Array.Clone copies every element regardless of whether the array lengths
      // still agree with rows/columns, so cloning cannot fail on a length mismatch.
      clone.Samples = (double[])samples.Clone();
      clone.Name = Name;
      clone.VariableNames = (string[])VariableNames.Clone();
      // The Samples setter above reset the clone's scaling; restore the real values.
      clone.scalingFactor = (double[])scalingFactor.Clone();
      clone.scalingOffset = (double[])scalingOffset.Clone();
      return clone;
    }

    /// <summary>
    /// Serializes the dataset. Name, dimensions, variable names and scaling are
    /// written as attributes; the samples are written as ';'-separated inner text
    /// using the invariant culture.
    /// </summary>
    public override XmlNode GetXmlNode(string name, XmlDocument document, IDictionary<Guid, IStorable> persistedObjects) {
      XmlNode node = base.GetXmlNode(name, document, persistedObjects);
      NumberFormatInfo invariant = CultureInfo.InvariantCulture.NumberFormat;
      AppendAttribute(document, node, "Name", Name);
      AppendAttribute(document, node, "Dimension1", rows.ToString(invariant));
      AppendAttribute(document, node, "Dimension2", columns.ToString(invariant));
      AppendAttribute(document, node, "VariableNames", GetVariableNamesString());
      AppendAttribute(document, node, "ScalingFactors", GetString(scalingFactor));
      AppendAttribute(document, node, "ScalingOffsets", GetString(scalingOffset));
      node.InnerText = ToString(invariant);
      return node;
    }

    /// <summary>Creates an attribute with the given name and value and appends it to <paramref name="node"/>.</summary>
    private static void AppendAttribute(XmlDocument document, XmlNode node, string attributeName, string value) {
      XmlAttribute attribute = document.CreateAttribute(attributeName);
      attribute.Value = value;
      node.Attributes.Append(attribute);
    }

    /// <summary>
    /// Restores the dataset from a node written by <see cref="GetXmlNode"/>.
    /// </summary>
    /// <exception cref="FormatException">
    /// The dimensions are negative, the number of sample tokens does not equal
    /// rows * columns, or a token cannot be parsed as a double.
    /// </exception>
    public override void Populate(XmlNode node, IDictionary<Guid, IStorable> restoredObjects) {
      base.Populate(node, restoredObjects);
      NumberFormatInfo invariant = CultureInfo.InvariantCulture.NumberFormat;
      Name = node.Attributes["Name"].Value;
      rows = int.Parse(node.Attributes["Dimension1"].Value, invariant);
      columns = int.Parse(node.Attributes["Dimension2"].Value, invariant);
      if(rows < 0 || columns < 0) throw new FormatException("Dataset dimensions must not be negative.");

      VariableNames = ParseVariableNamesString(node.Attributes["VariableNames"].Value);
      scalingFactor = ParseScalingAttribute(node.Attributes["ScalingFactors"], 1.0);
      scalingOffset = ParseScalingAttribute(node.Attributes["ScalingOffsets"], 0.0);

      int expectedCount = rows * columns;
      string text = node.InnerText;
      // Splitting an empty string yields one (empty) token, so an empty dataset
      // has to be recognized explicitly to round-trip through GetXmlNode.
      string[] tokens = (expectedCount == 0 && text.Trim().Length == 0) ? new string[0] : text.Split(';');
      if(tokens.Length != expectedCount)
        throw new FormatException("Expected " + expectedCount + " sample values but found " + tokens.Length + ".");

      double[] parsed = new double[expectedCount];
      for(int i = 0; i < expectedCount; i++) {
        if(!double.TryParse(tokens[i], NumberStyles.Float, invariant, out parsed[i])) {
          throw new FormatException("Can't parse " + tokens[i] + " as double value.");
        }
      }
      samples = parsed;
      CreateDictionaries();
    }

    /// <summary>
    /// Parses a scaling attribute. Files written in the old serialization format
    /// have no such attribute; in that case every column gets <paramref name="defaultValue"/>.
    /// </summary>
    private double[] ParseScalingAttribute(XmlAttribute attribute, double defaultValue) {
      if(attribute != null) return ParseDoubleString(attribute.Value);
      double[] xs = new double[columns];
      for(int i = 0; i < xs.Length; i++) xs[i] = defaultValue;
      return xs;
    }

    /// <summary>Returns all samples as a ';'-separated list formatted with the current culture.</summary>
    public override string ToString() {
      return ToString(CultureInfo.CurrentCulture.NumberFormat);
    }

    /// <summary>
    /// Returns all samples in row-major order as a ';'-separated list, each written
    /// with the round-trip format "r" and the given number format.
    /// </summary>
    private string ToString(NumberFormatInfo format) {
      StringBuilder builder = new StringBuilder();
      int count = rows * columns;
      for(int i = 0; i < count; i++) {
        if(i > 0) builder.Append(';');
        builder.Append(samples[i].ToString("r", format));
      }
      return builder.ToString();
    }

    /// <summary>Returns the variable names joined by "; ".</summary>
    private string GetVariableNamesString() {
      // Joining (instead of appending a separator after each name and trimming the
      // end) keeps trailing empty names and never strips characters from the last name.
      return string.Join("; ", variableNames);
    }

    /// <summary>Returns the values joined by "; ", each written with format "r" and the invariant culture.</summary>
    private string GetString(double[] xs) {
      StringBuilder builder = new StringBuilder();
      for(int i = 0; i < xs.Length; i++) {
        if(i > 0) builder.Append("; ");
        builder.Append(xs[i].ToString("r", CultureInfo.InvariantCulture));
      }
      return builder.ToString();
    }

    /// <summary>Splits a ';'-separated list of names, dropping empty entries and trimming each name.</summary>
    private string[] ParseVariableNamesString(string p) {
      p = p.Trim();
      string[] tokens = p.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
      for(int i = 0; i < tokens.Length; i++) tokens[i] = tokens[i].Trim();
      return tokens;
    }

    /// <summary>Splits a ';'-separated list and parses every non-empty entry with the invariant culture.</summary>
    private double[] ParseDoubleString(string s) {
      s = s.Trim();
      string[] ss = s.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
      double[] xs = new double[ss.Length];
      for(int i = 0; i < xs.Length; i++) {
        xs[i] = double.Parse(ss[i], CultureInfo.InvariantCulture);
      }
      return xs;
    }

    /// <summary>Copies the values of one column for the rows <paramref name="from"/>..<paramref name="to"/> (both inclusive).</summary>
    private double[] GetColumnValues(int column, int from, int to) {
      double[] values = new double[to - from + 1];
      for(int sample = from; sample <= to; sample++) {
        values[sample - from] = GetValue(sample, column);
      }
      return values;
    }

    /// <summary>Returns the mean of a column over all rows.</summary>
    public double GetMean(int column) {
      return GetMean(column, 0, Rows - 1);
    }

    /// <summary>
    /// Returns the mean of a column over the rows <paramref name="from"/>..<paramref name="to"/>
    /// (both inclusive). The result is computed with <c>Statistics.Mean</c> once and cached.
    /// </summary>
    public double GetMean(int column, int from, int to) {
      Dictionary<int, double> meansByEnd;
      double mean;
      if(cachedMeans[column].TryGetValue(from, out meansByEnd) && meansByEnd.TryGetValue(to, out mean)) {
        return mean;
      }
      mean = Statistics.Mean(GetColumnValues(column, from, to));
      if(meansByEnd == null) {
        meansByEnd = new Dictionary<int, double>();
        cachedMeans[column][from] = meansByEnd;
      }
      meansByEnd[to] = mean;
      return mean;
    }

    /// <summary>Returns the range of a column over all rows.</summary>
    public double GetRange(int column) {
      return GetRange(column, 0, Rows - 1);
    }

    /// <summary>
    /// Returns the range of a column over the rows <paramref name="from"/>..<paramref name="to"/>
    /// (both inclusive). The result is computed with <c>Statistics.Range</c> once and cached.
    /// </summary>
    public double GetRange(int column, int from, int to) {
      Dictionary<int, double> rangesByEnd;
      double range;
      if(cachedRanges[column].TryGetValue(from, out rangesByEnd) && rangesByEnd.TryGetValue(to, out range)) {
        return range;
      }
      range = Statistics.Range(GetColumnValues(column, from, to));
      if(rangesByEnd == null) {
        rangesByEnd = new Dictionary<int, double>();
        cachedRanges[column][from] = rangesByEnd;
      }
      rangesByEnd[to] = range;
      return range;
    }

    /// <summary>Returns the largest value of a column, or negative infinity if there are no rows.</summary>
    public double GetMaximum(int column) {
      double max = Double.NegativeInfinity;
      for(int i = 0; i < Rows; i++) {
        double val = GetValue(i, column);
        if(val > max) max = val;
      }
      return max;
    }

    /// <summary>Returns the smallest value of a column, or positive infinity if there are no rows.</summary>
    public double GetMinimum(int column) {
      double min = Double.PositiveInfinity;
      for(int i = 0; i < Rows; i++) {
        double val = GetValue(i, column);
        if(val < min) min = val;
      }
      return min;
    }

    /// <summary>
    /// Scales a column that is not scaled yet so that its minimum becomes 0 and,
    /// unless all values are equal, its maximum becomes 1. Columns that already
    /// carry a scaling are left untouched.
    /// </summary>
    internal void ScaleVariable(int column) {
      // Exact comparison is intended: 1.0 / 0.0 are the sentinel values that the
      // Samples setter and UnscaleVariable assign to mark a column as unscaled.
      if(scalingFactor[column] == 1.0 && scalingOffset[column] == 0.0) {
        double min = GetMinimum(column);
        double max = GetMaximum(column);
        double range = max - min;
        // The overload below clears the caches and calls FireChanged(), so this
        // method must not do it a second time.
        if(range == 0) ScaleVariable(column, 1.0, -min);
        else ScaleVariable(column, 1.0 / range, -min);
      }
    }

    /// <summary>
    /// Replaces every value x of a column by (x + <paramref name="offset"/>) * <paramref name="factor"/>,
    /// records factor and offset for the column, clears the caches and calls <c>FireChanged()</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the recorded factor/offset are overwritten, not combined, so
    /// calling this on a column that is already scaled loses the previous scaling,
    /// and a factor of 0 cannot be undone by <see cref="UnscaleVariable"/>.
    /// </remarks>
    internal void ScaleVariable(int column, double factor, double offset) {
      scalingFactor[column] = factor;
      scalingOffset[column] = offset;
      for(int i = 0; i < Rows; i++) {
        int index = i * columns + column;
        samples[index] = (samples[index] + offset) * factor;
      }
      CreateDictionaries();
      FireChanged();
    }

    /// <summary>
    /// Reverts the recorded scaling of a column (x / factor - offset) and marks the
    /// column as unscaled. If values were changed, the caches are cleared and
    /// <c>FireChanged()</c> is called.
    /// </summary>
    internal void UnscaleVariable(int column) {
      if(scalingFactor[column] != 1.0 || scalingOffset[column] != 0.0) {
        double factor = scalingFactor[column];
        double offset = scalingOffset[column];
        for(int i = 0; i < rows; i++) {
          int index = i * columns + column;
          samples[index] = samples[index] / factor - offset;
        }
        scalingFactor[column] = 1.0;
        scalingOffset[column] = 0.0;
        // The samples changed, so means/ranges cached for the scaled data are stale.
        CreateDictionaries();
        FireChanged();
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.