Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/3.2/Dataset.cs @ 2301

Last change on this file since 2301 was 2285, checked in by gkronber, 15 years ago

Worked on #722 (IModel should provide a Predict() method to get predicted values for an input vector).
At the same time removed parameter PunishmentFactor from GP algorithms (this parameter is internal to TreeEvaluators now).

File size: 14.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Xml;
25using HeuristicLab.Core;
26using HeuristicLab.Data;
27using System.Globalization;
28using System.Text;
29using System.Linq;
30
31namespace HeuristicLab.DataAnalysis {
32  public sealed class Dataset : ItemBase {
33
34    private string name;
35    private double[] samples;
36    private int rows;
37    private int columns;
38    private Dictionary<int, Dictionary<int, double>>[] cachedMeans;
39    private Dictionary<int, Dictionary<int, double>>[] cachedRanges;
40    private double[] scalingFactor;
41    private double[] scalingOffset;
42    private bool cachedValuesInvalidated = true;
43
44    private bool fireChangeEvents = true;
45    public bool FireChangeEvents {
46      get { return fireChangeEvents; }
47      set { fireChangeEvents = value; }
48    }
49
50    public string Name {
51      get { return name; }
52      set { name = value; }
53    }
54
55    public int Rows {
56      get { return rows; }
57      set { rows = value; }
58    }
59
60    public int Columns {
61      get { return columns; }
62      set {
63        columns = value;
64        if (variableNames == null || variableNames.Length != columns) {
65          variableNames = new string[columns];
66        }
67      }
68    }
69
70    public double[] ScalingFactor {
71      get { return scalingFactor; }
72      set {
73        if (value.Length != scalingFactor.Length)
74          throw new ArgumentException("Length of scaling factor array doesn't match number of variables");
75        scalingFactor = value;
76      }
77    }
78    public double[] ScalingOffset {
79      get { return scalingOffset; }
80      set {
81        if (value.Length != scalingOffset.Length)
82          throw new ArgumentException("Length of scaling offset array doesn't match number of variables");
83        scalingOffset = value;
84      }
85    }
86
87    public double GetValue(int i, int j) {
88      return samples[columns * i + j];
89    }
90
91    public void SetValue(int i, int j, double v) {
92      if (v != samples[columns * i + j]) {
93        samples[columns * i + j] = v;
94        cachedValuesInvalidated = true;
95        if (fireChangeEvents) FireChanged();
96      }
97    }
98
99    public double[] Samples {
100      get { return samples; }
101      set {
102        variableNames = Enumerable.Range(1, columns).Select(x => "Var" + x.ToString("###")).ToArray();
103        scalingFactor = new double[columns];
104        scalingOffset = new double[columns];
105        for (int i = 0; i < scalingFactor.Length; i++) {
106          scalingFactor[i] = 1.0;
107          scalingOffset[i] = 0.0;
108        }
109        samples = value;
110        cachedValuesInvalidated = true;
111        if (fireChangeEvents) FireChanged();
112      }
113    }
114
115    private string[] variableNames;
116    public IEnumerable<string> VariableNames {
117      get { return variableNames; }
118    }
119
120    public Dataset()
121      : this(new double[,] { { 0.0 } }) {
122    }
123
124    public Dataset(double[,] samples) {
125      Name = "-";
126      Rows = samples.GetLength(0);
127      Columns = samples.GetLength(1);
128      double[] values = new double[Rows * Columns];
129      int i = 0;
130      for (int row = 0; row < Rows; row++) {
131        for (int column = 0; column < columns; column++) {
132          values[i++] = samples[row, column];
133        }
134      }
135      Samples = values;
136      fireChangeEvents = true;
137    }
138
139
140    public string GetVariableName(int variableIndex) {
141      return variableNames[variableIndex];
142    }
143
144    public int GetVariableIndex(string variableName) {
145      for (int i = 0; i < variableNames.Length; i++) {
146        if (variableNames[i].Equals(variableName)) return i;
147      }
148      throw new ArgumentException("The variable name " + variableName + " was not found.");
149    }
150
151    public void SetVariableName(int variableIndex, string name) {
152      variableNames[variableIndex] = name;
153    }
154
155    public override IView CreateView() {
156      return new DatasetView(this);
157    }
158
159    #region persistence
160    public override object Clone(IDictionary<Guid, object> clonedObjects) {
161      Dataset clone = new Dataset();
162      clonedObjects.Add(Guid, clone);
163      double[] cloneSamples = new double[rows * columns];
164      Array.Copy(samples, cloneSamples, samples.Length);
165      clone.rows = rows;
166      clone.columns = columns;
167      clone.Samples = cloneSamples;
168      clone.Name = Name;
169      clone.variableNames = new string[variableNames.Length];
170      Array.Copy(variableNames, clone.variableNames, variableNames.Length);
171      Array.Copy(scalingFactor, clone.scalingFactor, columns);
172      Array.Copy(scalingOffset, clone.scalingOffset, columns);
173      return clone;
174    }
175
176    public override XmlNode GetXmlNode(string name, XmlDocument document, IDictionary<Guid, IStorable> persistedObjects) {
177      XmlNode node = base.GetXmlNode(name, document, persistedObjects);
178      XmlAttribute problemName = document.CreateAttribute("Name");
179      problemName.Value = Name;
180      node.Attributes.Append(problemName);
181      XmlAttribute dim1 = document.CreateAttribute("Dimension1");
182      dim1.Value = rows.ToString(CultureInfo.InvariantCulture.NumberFormat);
183      node.Attributes.Append(dim1);
184      XmlAttribute dim2 = document.CreateAttribute("Dimension2");
185      dim2.Value = columns.ToString(CultureInfo.InvariantCulture.NumberFormat);
186      node.Attributes.Append(dim2);
187      XmlAttribute variableNames = document.CreateAttribute("VariableNames");
188      variableNames.Value = GetVariableNamesString();
189      node.Attributes.Append(variableNames);
190      XmlAttribute scalingFactorsAttribute = document.CreateAttribute("ScalingFactors");
191      scalingFactorsAttribute.Value = GetString(scalingFactor);
192      node.Attributes.Append(scalingFactorsAttribute);
193      XmlAttribute scalingOffsetsAttribute = document.CreateAttribute("ScalingOffsets");
194      scalingOffsetsAttribute.Value = GetString(scalingOffset);
195      node.Attributes.Append(scalingOffsetsAttribute);
196      node.InnerText = ToString(CultureInfo.InvariantCulture.NumberFormat);
197      return node;
198    }
199
200    public override void Populate(XmlNode node, IDictionary<Guid, IStorable> restoredObjects) {
201      base.Populate(node, restoredObjects);
202      Name = node.Attributes["Name"].Value;
203      rows = int.Parse(node.Attributes["Dimension1"].Value, CultureInfo.InvariantCulture.NumberFormat);
204      columns = int.Parse(node.Attributes["Dimension2"].Value, CultureInfo.InvariantCulture.NumberFormat);
205
206      variableNames = ParseVariableNamesString(node.Attributes["VariableNames"].Value);
207      if (node.Attributes["ScalingFactors"] != null)
208        scalingFactor = ParseDoubleString(node.Attributes["ScalingFactors"].Value);
209      else {
210        scalingFactor = new double[columns]; // compatibility with old serialization format
211        for (int i = 0; i < scalingFactor.Length; i++) scalingFactor[i] = 1.0;
212      }
213      if (node.Attributes["ScalingOffsets"] != null)
214        scalingOffset = ParseDoubleString(node.Attributes["ScalingOffsets"].Value);
215      else {
216        scalingOffset = new double[columns]; // compatibility with old serialization format
217        for (int i = 0; i < scalingOffset.Length; i++) scalingOffset[i] = 0.0;
218      }
219
220      string[] tokens = node.InnerText.Split(';');
221      if (tokens.Length != rows * columns) throw new FormatException();
222      samples = new double[rows * columns];
223      for (int row = 0; row < rows; row++) {
224        for (int column = 0; column < columns; column++) {
225          if (double.TryParse(tokens[row * columns + column], NumberStyles.Float, CultureInfo.InvariantCulture.NumberFormat, out samples[row * columns + column]) == false) {
226            throw new FormatException("Can't parse " + tokens[row * columns + column] + " as double value.");
227          }
228        }
229      }
230    }
231
232    public override string ToString() {
233      return ToString(CultureInfo.CurrentCulture.NumberFormat);
234    }
235
236    private string ToString(NumberFormatInfo format) {
237      StringBuilder builder = new StringBuilder();
238      for (int row = 0; row < rows; row++) {
239        for (int column = 0; column < columns; column++) {
240          builder.Append(";");
241          builder.Append(samples[row * columns + column].ToString("r", format));
242        }
243      }
244      if (builder.Length > 0) builder.Remove(0, 1);
245      return builder.ToString();
246    }
247
248    private string GetVariableNamesString() {
249      string s = "";
250      for (int i = 0; i < variableNames.Length; i++) {
251        s += variableNames[i] + "; ";
252      }
253
254      if (variableNames.Length > 0) {
255        s = s.TrimEnd(';', ' ');
256      }
257      return s;
258    }
259    private string GetString(double[] xs) {
260      string s = "";
261      for (int i = 0; i < xs.Length; i++) {
262        s += xs[i].ToString("r", CultureInfo.InvariantCulture) + "; ";
263      }
264
265      if (xs.Length > 0) {
266        s = s.TrimEnd(';', ' ');
267      }
268      return s;
269    }
270
271    private string[] ParseVariableNamesString(string p) {
272      p = p.Trim();
273      string[] tokens = p.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
274      for (int i = 0; i < tokens.Length; i++) tokens[i] = tokens[i].Trim();
275      return tokens;
276    }
277    private double[] ParseDoubleString(string s) {
278      s = s.Trim();
279      string[] ss = s.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
280      double[] xs = new double[ss.Length];
281      for (int i = 0; i < xs.Length; i++) {
282        xs[i] = double.Parse(ss[i], CultureInfo.InvariantCulture);
283      }
284      return xs;
285    }
286    #endregion
287
288    public double GetMean(int column) {
289      return GetMean(column, 0, Rows);
290    }
291
292    public double GetMean(int column, int from, int to) {
293      if (cachedValuesInvalidated) CreateDictionaries();
294      if (!cachedMeans[column].ContainsKey(from) || !cachedMeans[column][from].ContainsKey(to)) {
295        double[] values = new double[to - from];
296        for (int sample = from; sample < to; sample++) {
297          values[sample - from] = GetValue(sample, column);
298        }
299        double mean = Statistics.Mean(values);
300        if (!cachedMeans[column].ContainsKey(from)) cachedMeans[column][from] = new Dictionary<int, double>();
301        cachedMeans[column][from][to] = mean;
302        return mean;
303      } else {
304        return cachedMeans[column][from][to];
305      }
306    }
307
308    public double GetRange(int column) {
309      return GetRange(column, 0, Rows);
310    }
311
312    public double GetRange(int column, int from, int to) {
313      if (cachedValuesInvalidated) CreateDictionaries();
314      if (!cachedRanges[column].ContainsKey(from) || !cachedRanges[column][from].ContainsKey(to)) {
315        double[] values = new double[to - from];
316        for (int sample = from; sample < to; sample++) {
317          values[sample - from] = GetValue(sample, column);
318        }
319        double range = Statistics.Range(values);
320        if (!cachedRanges[column].ContainsKey(from)) cachedRanges[column][from] = new Dictionary<int, double>();
321        cachedRanges[column][from][to] = range;
322        return range;
323      } else {
324        return cachedRanges[column][from][to];
325      }
326    }
327
328    public double GetMaximum(int column) {
329      return GetMaximum(column, 0, Rows);
330    }
331
332    public double GetMaximum(int column, int start, int end) {
333      double max = Double.NegativeInfinity;
334      for (int i = start; i < end; i++) {
335        double val = GetValue(i, column);
336        if (!double.IsNaN(val) && val > max) max = val;
337      }
338      return max;
339    }
340
341    public double GetMinimum(int column) {
342      return GetMinimum(column, 0, Rows);
343    }
344
345    public double GetMinimum(int column, int start, int end) {
346      double min = Double.PositiveInfinity;
347      for (int i = start; i < end; i++) {
348        double val = GetValue(i, column);
349        if (!double.IsNaN(val) && val < min) min = val;
350      }
351      return min;
352    }
353
354    internal void ScaleVariable(int column) {
355      if (scalingFactor[column] == 1.0 && scalingOffset[column] == 0.0) {
356        double min = GetMinimum(column);
357        double max = GetMaximum(column);
358        double range = max - min;
359        if (range == 0) ScaleVariable(column, 1.0, -min);
360        else ScaleVariable(column, 1.0 / range, -min);
361      }
362      cachedValuesInvalidated = true;
363      if (fireChangeEvents) FireChanged();
364    }
365
366    internal void ScaleVariable(int column, double factor, double offset) {
367      scalingFactor[column] = factor;
368      scalingOffset[column] = offset;
369      for (int i = 0; i < Rows; i++) {
370        double origValue = samples[i * columns + column];
371        samples[i * columns + column] = (origValue + offset) * factor;
372      }
373      cachedValuesInvalidated = true;
374      if (fireChangeEvents) FireChanged();
375    }
376
377    internal void UnscaleVariable(int column) {
378      if (scalingFactor[column] != 1.0 || scalingOffset[column] != 0.0) {
379        for (int i = 0; i < rows; i++) {
380          double scaledValue = samples[i * columns + column];
381          samples[i * columns + column] = scaledValue / scalingFactor[column] - scalingOffset[column];
382        }
383        scalingFactor[column] = 1.0;
384        scalingOffset[column] = 0.0;
385      }
386      cachedValuesInvalidated = true;
387      if (fireChangeEvents) FireChanged();
388    }
389
390    private void CreateDictionaries() {
391      // keep a means and ranges dictionary for each column (possible target variable) of the dataset.
392      cachedMeans = new Dictionary<int, Dictionary<int, double>>[columns];
393      cachedRanges = new Dictionary<int, Dictionary<int, double>>[columns];
394      for (int i = 0; i < columns; i++) {
395        cachedMeans[i] = new Dictionary<int, Dictionary<int, double>>();
396        cachedRanges[i] = new Dictionary<int, Dictionary<int, double>>();
397      }
398      cachedValuesInvalidated = false;
399    }
400  }
401}
Note: See TracBrowser for help on using the repository browser.