Free cookie consent management tool by TermsFeed Policy Generator

source: branches/Collections/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 771

Last change on this file since 771 was 363, checked in by gkronber, 16 years ago
  • implemented an operator to store the best-of-run solution (with regard to a specific fitness variable).
  • adapted struct-id infrastructure to allow evaluation of models on validation data.

ticket #194

File size: 12.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// Parses a text file that consists of optional metadata lines followed by rows of numeric samples.
  /// Tokens are separated by a space, a semicolon or a tab. A metadata line has the form
  /// <c>@ NAME = value value ...</c> where "@" and "=" must be stand-alone tokens.
  /// All remaining lines are interpreted as rows of samples; the first row defines the number of columns.
  /// </summary>
  public class DatasetParser {
    // names of the metadata entries that are exposed through properties
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private Tokenizer tokenizer;
    // metadata name => value tokens found on the metadata line
    private Dictionary<string, List<Token>> metadata;
    // rows collected while parsing, flattened into 'samples' at the end of Import
    private List<List<double>> samplesList;

    private int rows;
    /// <summary>Number of sample rows; set by <see cref="Import"/>.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of values per row (length of the first row); set by <see cref="Import"/>.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    /// <summary>
    /// The samples in row-major order (element <c>row * Columns + column</c>).
    /// Null until <see cref="Import"/> has completed successfully.
    /// </summary>
    public double[] Samples {
      get {
        return samples;
      }
    }

    /// <summary>First value of the PROBLEMNAME entry or "-" if there is none.</summary>
    public string ProblemName {
      get {
        Token token = GetFirstMetadataToken(PROBLEMNAME);
        return token != null ? token.stringValue : "-";
      }
    }

    /// <summary>
    /// The values of the VARIABLENAMES entry. If the entry is missing, names of the form
    /// "X000", "X001", ... are generated, one per column.
    /// </summary>
    public string[] VariableNames {
      get {
        List<Token> nameList;
        if(metadata.TryGetValue(VARIABLENAMES, out nameList)) {
          string[] names = new string[nameList.Count];
          for(int i = 0; i < names.Length; i++) {
            names[i] = nameList[i].stringValue;
          }
          return names;
        } else {
          string[] names = new string[columns];
          for(int i = 0; i < names.Length; i++) {
            names[i] = "X" + i.ToString("000");
          }
          return names;
        }
      }
    }

    /// <summary>Integer value of the TARGETVARIABLE entry; defaults to 0 (the first column).</summary>
    public int TargetVariable {
      get { return GetIntMetadata(TARGETVARIABLE, 0); }
    }

    /// <summary>Integer value of the MAXIMUMTREEHEIGHT entry; defaults to 0.</summary>
    public int MaxTreeHeight {
      get { return GetIntMetadata(MAXIMUMTREEHEIGHT, 0); }
    }

    /// <summary>Integer value of the MAXIMUMTREESIZE entry; defaults to 0.</summary>
    public int MaxTreeSize {
      get { return GetIntMetadata(MAXIMUMTREESIZE, 0); }
    }

    /// <summary>Integer value of the TRAININGSAMPLESSTART entry; defaults to 0.</summary>
    public int TrainingSamplesStart {
      get { return GetIntMetadata(TRAININGSAMPLESSTART, 0); }
    }

    /// <summary>Integer value of the TRAININGSAMPLESEND entry; defaults to the number of rows.</summary>
    public int TrainingSamplesEnd {
      get { return GetIntMetadata(TRAININGSAMPLESEND, rows); }
    }

    /// <summary>Integer value of the VALIDATIONSAMPLESSTART entry; defaults to 0.</summary>
    public int ValidationSamplesStart {
      get { return GetIntMetadata(VALIDATIONSAMPLESSTART, 0); }
    }

    /// <summary>Integer value of the VALIDATIONSAMPLESEND entry; defaults to the number of rows.</summary>
    public int ValidationSamplesEnd {
      get { return GetIntMetadata(VALIDATIONSAMPLESEND, rows); }
    }

    /// <summary>Creates a parser without any metadata or samples.</summary>
    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    /// <summary>
    /// Returns the first value token of the given metadata entry, or null when the entry
    /// is missing or was declared without any value.
    /// </summary>
    private Token GetFirstMetadataToken(string key) {
      List<Token> values;
      if(metadata.TryGetValue(key, out values) && values.Count > 0) {
        return values[0];
      }
      return null;
    }

    /// <summary>
    /// Returns the intValue of the first value token of the given metadata entry,
    /// or <paramref name="defaultValue"/> when the entry is missing or has no value.
    /// </summary>
    private int GetIntMetadata(string key, int defaultValue) {
      Token token = GetFirstMetadataToken(key);
      return token != null ? token.intValue : defaultValue;
    }

    /// <summary>
    /// Reads the given file and fills <see cref="Samples"/>, <see cref="Rows"/>,
    /// <see cref="Columns"/> and the metadata properties.
    /// Results of a previous import are discarded before parsing starts.
    /// </summary>
    /// <param name="importFileName">Path of the file to read.</param>
    /// <param name="strict">
    /// When true, a row whose length differs from the first row or a token that is not a
    /// number raises an error. When false, short rows are padded with NaN, surplus values
    /// are dropped and unreadable values are replaced by NaN.
    /// </param>
    /// <exception cref="DataFormatException">
    /// The file violates the expected format or does not contain any sample rows.
    /// </exception>
    public void Import(string importFileName, bool strict) {
      // start from a clean state so that rows and metadata of an earlier import do not leak into this one
      metadata.Clear();
      samplesList.Clear();

      // 'using' also releases the file when the tokenizer could not be set up
      using(StreamReader reader = new StreamReader(importFileName)) {
        this.tokenizer = new Tokenizer(reader);
        tokenizer.Separators = new string[] { " ", ";", "\t" };

        // parse the file
        Parse(strict);
      }

      if(samplesList.Count == 0) {
        // without this check the access to the first row below would fail with an ArgumentOutOfRangeException
        Error("The dataset does not contain any samples.", "", tokenizer.CurrentLineNumber);
      }

      // translate the list of rows into one flat, row-major array
      rows = samplesList.Count;
      columns = samplesList[0].Count;
      samples = new double[rows * columns];

      int i = 0;
      foreach(List<double> row in samplesList) {
        int j = 0;
        foreach(double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }

    /// <summary>
    /// A single token of the input. stringValue always holds the original text;
    /// intValue / doubleValue are only meaningful for tokens of type Int / Double.
    /// </summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the lines of a reader into tokens. Every line is terminated by <see cref="NewlineToken"/>.
    /// Lines are read on demand, so <see cref="Separators"/> can be assigned after construction
    /// and <see cref="CurrentLineNumber"/> is the line of the token that was returned last.
    /// </summary>
    class Tokenizer {
      private StreamReader reader;
      // tokens of the current line that have not been consumed yet
      private Queue<Token> tokens;
      private string[] separators;

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      // shared instances; the parser recognizes these tokens by reference comparison
      public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

      /// <summary>
      /// Strings that separate tokens within a line. If null, String.Split falls back to
      /// white-space characters.
      /// </summary>
      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }


      /// <summary>Creates a tokenizer for the given reader. Nothing is read until the first token is requested.</summary>
      public Tokenizer(StreamReader reader) {
        this.reader = reader;
        tokens = new Queue<Token>();
      }

      /// <summary>Reads the next line if all tokens of the current line have been consumed.</summary>
      private void EnsureTokens() {
        if(tokens.Count == 0) {
          ReadNextTokens();
        }
      }

      /// <summary>Tokenizes the next line of the reader (if any) and appends a newline token.</summary>
      private void ReadNextTokens() {
        if(!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          foreach(string part in CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
            tokens.Enqueue(MakeToken(part));
          }
          tokens.Enqueue(NewlineToken);
          CurrentLineNumber++;
        }
      }

      /// <summary>
      /// Tries to read <paramref name="text"/> as int and then as double using the given format.
      /// On success the value and the type of <paramref name="token"/> are set.
      /// </summary>
      private static bool TryParseNumber(string text, NumberFormatInfo format, Token token) {
        if(int.TryParse(text, NumberStyles.Integer, format, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
          return true;
        }
        if(double.TryParse(text, NumberStyles.Float, format, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
          return true;
        }
        return false;
      }

      /// <summary>
      /// Creates the token for a piece of text: the shared "@" or "=" token, a numeric token,
      /// or a string token when the text is not a number in any of the tried cultures.
      /// </summary>
      private static Token MakeToken(string strToken) {
        if(strToken == "@")
          return AtToken;
        else if(strToken == "=")
          return AssignmentToken;

        Token token = new Token(TokenTypeEnum.String, strToken);

        // try invariant culture
        if(TryParseNumber(strToken, CultureInfo.InvariantCulture.NumberFormat, token))
          return token;

        // try german culture
        if(TryParseNumber(strToken, CultureInfo.GetCultureInfo("de-DE").NumberFormat, token))
          return token;

        // try current culture; if this fails as well the token stays a string token
        TryParseNumber(strToken, CultureInfo.CurrentCulture.NumberFormat, token);
        return token;
      }

      /// <summary>Returns the next token without consuming it.</summary>
      /// <exception cref="InvalidOperationException">There are no more tokens.</exception>
      public Token Peek() {
        EnsureTokens();
        return tokens.Peek();
      }

      /// <summary>Consumes and returns the next token.</summary>
      /// <exception cref="InvalidOperationException">There are no more tokens.</exception>
      public Token Next() {
        EnsureTokens();
        return tokens.Dequeue();
      }

      /// <summary>Returns true if there are buffered tokens or unread lines.</summary>
      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    /// <summary>Parses the leading metadata lines and then the sample rows.</summary>
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

    /// <summary>
    /// Reads all remaining tokens as rows of samples and appends the rows to samplesList.
    /// See <see cref="Import"/> for the meaning of <paramref name="strict"/>.
    /// </summary>
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while(tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if(current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if(current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if(current == Tokenizer.NewlineToken) {
          // when parsing strictly all rows have to have the same number of values
          if(strict) {
            // the first row defines how many samples are needed
            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
            }
          } else if(samplesList.Count > 0) {
            // when we are not strict then fill or drop elements as needed
            int expected = samplesList[0].Count;
            if(expected > row.Count) {
              // fill with NAN
              for(int i = row.Count; i < expected; i++) {
                row.Add(double.NaN);
              }
            } else if(expected < row.Count) {
              // drop the surplus elements at the end of the row;
              // removal has to start at index 'expected' to keep the first 'expected' values
              row.RemoveRange(expected, row.Count - expected);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => raise an error when parsing strictly
          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
          if(strict) {
            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

    /// <summary>
    /// Reads all leading lines of the form "@ NAME = values..." and stores the value tokens
    /// under NAME. A later entry with the same name replaces an earlier one.
    /// </summary>
    private void ParseMetaData(bool strict) {
      // HasNext guards against files that are empty or contain nothing but metadata
      while(tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        if(nameToken.type != TokenTypeEnum.String)
          Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);

        Expect(Tokenizer.AssignmentToken);

        // every line ends with a newline token, so this loop cannot run past the end of the input
        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while(valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    /// <summary>Consumes the next token and raises an error if it is not the expected one.</summary>
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if(actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
      }
    }

    /// <summary>Throws a DataFormatException that carries the message, the offending token and the line number.</summary>
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.