
source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 415

Last change on this file since 415 was 406, checked in by gkronber, 16 years ago

fixed #209 by changing the import format while keeping compatibility with HL2 exported files

File size: 12.7 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using HeuristicLab.Data;

namespace HeuristicLab.DataAnalysis {
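  // Parses text files for data analysis problems. A file consists of an optional block of
  // metadata lines (a key such as PROBLEMNAME, VARIABLENAMES or TARGETVARIABLE followed by
  // its values) and the numeric sample rows, with values separated by the separator
  // characters '@', '=', ';' and tab. The parsed samples are exposed as a flat, row-major
  // array via Rows, Columns and Samples.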
  public class DatasetParser {
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }

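    // metadata accessors: each property returns the value of the corresponding metadata
    // entry, or a default value (e.g. "-", 0 or the number of rows) if the entry is missing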
    public string ProblemName {
      get {
        if(metadata.ContainsKey(PROBLEMNAME)) {
          return metadata[PROBLEMNAME][0].stringValue;
        } else return "-";
      }
    }

    public string[] VariableNames {
      get {
        if(metadata.ContainsKey(VARIABLENAMES)) {
          List<Token> nameList = metadata[VARIABLENAMES];
          string[] names = new string[nameList.Count];
          for(int i = 0; i < names.Length; i++) {
            names[i] = nameList[i].stringValue;
          }
          return names;
        } else {
          string[] names = new string[columns];
          for(int i = 0; i < names.Length; i++) {
            names[i] = "X" + i.ToString("000");
          }
          return names;
        }
      }
    }

    public int TargetVariable {
      get {
        if(metadata.ContainsKey(TARGETVARIABLE)) {
          return metadata[TARGETVARIABLE][0].intValue;
        } else return 0; // default is the first column
      }
    }

    public int MaxTreeHeight {
      get {
        if(metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
          return metadata[MAXIMUMTREEHEIGHT][0].intValue;
        } else return 0;
      }
    }

    public int MaxTreeSize {
      get {
        if(metadata.ContainsKey(MAXIMUMTREESIZE)) {
          return metadata[MAXIMUMTREESIZE][0].intValue;
        } else return 0;
      }
    }

    public int TrainingSamplesStart {
      get {
        if(metadata.ContainsKey(TRAININGSAMPLESSTART)) {
          return metadata[TRAININGSAMPLESSTART][0].intValue;
        } else return 0;
      }
    }

    public int TrainingSamplesEnd {
      get {
        if(metadata.ContainsKey(TRAININGSAMPLESEND)) {
          return metadata[TRAININGSAMPLESEND][0].intValue;
        } else return rows;
      }
    }
    public int ValidationSamplesStart {
      get {
        if(metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
          return metadata[VALIDATIONSAMPLESSTART][0].intValue;
        } else return 0;
      }
    }

    public int ValidationSamplesEnd {
      get {
        if(metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
          return metadata[VALIDATIONSAMPLESEND][0].intValue;
        } else return rows;
      }
    }
    public int TestSamplesStart {
      get {
        if(metadata.ContainsKey(TESTSAMPLESSTART)) {
          return metadata[TESTSAMPLESSTART][0].intValue;
        } else return 0;
      }
    }

    public int TestSamplesEnd {
      get {
        if(metadata.ContainsKey(TESTSAMPLESEND)) {
          return metadata[TESTSAMPLESEND][0].intValue;
        } else return rows;
      }
    }

    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    public void Reset() {
      metadata.Clear();
      samplesList.Clear();
    }

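    // imports the file: parses it via TryParse and stores the result in Rows, Columns and Samples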
    public void Import(string importFileName, bool strict) {
      TryParse(importFileName, strict);
      // flatten the list of sample rows into a row-major double array
      samples = new double[samplesList.Count * samplesList[0].Count];
      rows = samplesList.Count;
      columns = samplesList[0].Count;

      int i = 0;
      int j = 0;
      foreach(List<double> row in samplesList) {
        j = 0;
        foreach(double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

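    // tries to parse the file with several number formats (invariant, de-DE, current culture);
    // if parsing fails for all of them the last DataFormatException is rethrown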
    private void TryParse(string importFileName, bool strict) {
      Exception lastEx = null;
      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };
      foreach(NumberFormatInfo numberFormat in possibleFormats) {
        using(StreamReader reader = new StreamReader(importFileName)) {
          tokenizer = new Tokenizer(reader, numberFormat);
          try {
            // parse the file
            Parse(strict);
            return; // parsed without errors -> return;
          } catch(DataFormatException ex) {
            lastEx = ex;
          }
        }
      }
      // all number formats threw an exception -> rethrow the last exception
      throw lastEx;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int, WhiteSpace
    }

    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

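    // line-based tokenizer: splits each input line on the separator characters, classifies
    // every token as Int, Double or String and marks the end of each line with a NewlineToken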
    class Tokenizer {
      private StreamReader reader;
      private List<Token> tokens;
      private string[] separators = new string[] { "@", "=", ";", "\t" };
      private NumberFormatInfo numberFormatInfo;

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
      public static Token SeparatorToken = new Token(TokenTypeEnum.WhiteSpace, "");
      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }

      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        tokens = new List<Token>();
        ReadNextTokens();
      }

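      // reads the next line from the stream, converts it into tokens and appends a NewlineToken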
      private void ReadNextTokens() {
        if(!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
            return MakeToken(str.Trim());
          });

          foreach(Token tok in newTokens) {
            if(tok != SeparatorToken) tokens.Add(tok);
          }
          tokens.Add(NewlineToken);
          CurrentLineNumber++;
        }
      }

      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);

        // try to parse as a number first
        if(int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
          return token;
        } else if(double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
          return token;
        }
        // couldn't parse the token as an int or double, so return a string token
        return token;
      }

      public Token Peek() {
        return tokens[0];
      }

      public Token Next() {
        Token next = tokens[0];
        tokens.RemoveAt(0);
        if(tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

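    // reads the numeric sample rows; in strict mode every row must have exactly as many values
    // as the first row, otherwise short rows are padded with NaN and long rows are truncated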
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while(tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if(current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if(current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if(current == Tokenizer.NewlineToken) {
          // when parsing strictly all rows have to have the same number of values
          if(strict) {
            // the first row defines how many samples are needed
            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
            }
          } else if(samplesList.Count > 0) {
            // when we are not strict then fill or drop elements as needed
            if(samplesList[0].Count > row.Count) {
              // fill with NaN
              for(int i = row.Count; i < samplesList[0].Count; i++) {
                row.Add(double.NaN);
              }
            } else if(samplesList[0].Count < row.Count) {
              // drop the last k elements, where k = row length - length of the first row
              row.RemoveRange(samplesList[0].Count, row.Count - samplesList[0].Count);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => report an error when parsing strictly
          // when parsing non-strictly unreadable values are replaced by NaN instead
          if(strict) {
            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

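    // reads metadata lines as long as the line starts with a string (non-numeric) token:
    // the first token is the key, the remaining tokens up to the end of the line are its values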
    private void ParseMetaData(bool strict) {
      while(tokenizer.Peek().type == TokenTypeEnum.String) {
        Token nameToken = tokenizer.Next();
        if(nameToken.type != TokenTypeEnum.String)
          Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);

        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while(valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if(actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
      }
    }

    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}