Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 771

Last change on this file since 771 was 620, checked in by gkronber, 16 years ago

Added checks that the imported data seems valid; the parser now throws a DataFormatException, which should be handled by objects that use the DatasetParser. (fixed #292)

File size: 13.6 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// Imports a dataset from a text file. The file starts with optional metadata lines (a name followed
  /// by its values) and continues with the rows of sample values. Fields on a line are separated by
  /// '@', '=', ';' or a tab character.
  /// </summary>
  public class DatasetParser {
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";
    // message used whenever a parse attempt yields no data values at all
    private const string NODATAVALUESMESSAGE = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    /// <summary>Number of sample rows; set by <see cref="Import"/>.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of columns (length of the first sample row); set by <see cref="Import"/>.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    /// <summary>
    /// The imported values in row-major order (element [row * Columns + column]).
    /// Null until <see cref="Import"/> has completed successfully.
    /// </summary>
    public double[] Samples {
      get { return samples; }
    }

    /// <summary>Value of the PROBLEMNAME metadata entry, or "-" when it is missing or has no value.</summary>
    public string ProblemName {
      get {
        Token token;
        if(TryGetFirstToken(PROBLEMNAME, out token)) {
          return token.stringValue;
        }
        return "-";
      }
    }

    /// <summary>
    /// The values of the VARIABLENAMES metadata entry. When the entry is missing, one generated name
    /// per column is returned ("X000", "X001", ...).
    /// </summary>
    public string[] VariableNames {
      get {
        List<Token> nameList;
        string[] names;
        if(metadata.TryGetValue(VARIABLENAMES, out nameList)) {
          names = new string[nameList.Count];
          for(int i = 0; i < names.Length; i++) {
            names[i] = nameList[i].stringValue;
          }
        } else {
          names = new string[columns];
          for(int i = 0; i < names.Length; i++) {
            names[i] = "X" + i.ToString("000");
          }
        }
        return names;
      }
    }

    /// <summary>Value of the TARGETVARIABLE metadata entry; defaults to 0 (the first column).</summary>
    public int TargetVariable {
      get { return GetIntMetadata(TARGETVARIABLE, 0); }
    }

    /// <summary>Value of the MAXIMUMTREEHEIGHT metadata entry; defaults to 0.</summary>
    public int MaxTreeHeight {
      get { return GetIntMetadata(MAXIMUMTREEHEIGHT, 0); }
    }

    /// <summary>Value of the MAXIMUMTREESIZE metadata entry; defaults to 0.</summary>
    public int MaxTreeSize {
      get { return GetIntMetadata(MAXIMUMTREESIZE, 0); }
    }

    /// <summary>Value of the TRAININGSAMPLESSTART metadata entry; defaults to 0.</summary>
    public int TrainingSamplesStart {
      get { return GetIntMetadata(TRAININGSAMPLESSTART, 0); }
    }

    /// <summary>Value of the TRAININGSAMPLESEND metadata entry; defaults to the number of rows.</summary>
    public int TrainingSamplesEnd {
      get { return GetIntMetadata(TRAININGSAMPLESEND, rows); }
    }

    /// <summary>Value of the VALIDATIONSAMPLESSTART metadata entry; defaults to 0.</summary>
    public int ValidationSamplesStart {
      get { return GetIntMetadata(VALIDATIONSAMPLESSTART, 0); }
    }

    /// <summary>Value of the VALIDATIONSAMPLESEND metadata entry; defaults to the number of rows.</summary>
    public int ValidationSamplesEnd {
      get { return GetIntMetadata(VALIDATIONSAMPLESEND, rows); }
    }

    /// <summary>Value of the TESTSAMPLESSTART metadata entry; defaults to 0.</summary>
    public int TestSamplesStart {
      get { return GetIntMetadata(TESTSAMPLESSTART, 0); }
    }

    /// <summary>Value of the TESTSAMPLESEND metadata entry; defaults to the number of rows.</summary>
    public int TestSamplesEnd {
      get { return GetIntMetadata(TESTSAMPLESEND, rows); }
    }

    /// <summary>
    /// The integer values of the NONINPUTVARIABLES metadata entry (an empty list when the entry is
    /// missing). A new list is created on every call.
    /// </summary>
    public List<int> NonInputVariables {
      get {
        List<int> disallowedVariables = new List<int>();
        List<Token> tokens;
        if(metadata.TryGetValue(NONINPUTVARIABLES, out tokens)) {
          foreach(Token t in tokens) {
            disallowedVariables.Add(t.intValue);
          }
        }
        return disallowedVariables;
      }
    }

    /// <summary>Creates a parser without any metadata or samples.</summary>
    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    /// <summary>Discards all metadata entries and sample rows collected so far.</summary>
    public void Reset() {
      metadata.Clear();
      samplesList.Clear();
    }

    /// <summary>
    /// Parses the given file and flattens the collected sample rows into <see cref="Samples"/>,
    /// updating <see cref="Rows"/> and <see cref="Columns"/>. Rows are appended to whatever has been
    /// imported since the last <see cref="Reset"/>.
    /// </summary>
    /// <param name="importFileName">Path of the file to read.</param>
    /// <param name="strict">
    /// When true, rows whose length differs from the first row and unreadable values raise an error;
    /// when false, rows are padded with NaN or truncated and unreadable values become NaN.
    /// </param>
    /// <exception cref="DataFormatException">The file could not be parsed with any of the tried number formats.</exception>
    public void Import(string importFileName, bool strict) {
      TryParse(importFileName, strict);

      // a successful parse guarantees at least one row (see Parse)
      rows = samplesList.Count;
      columns = samplesList[0].Count;
      samples = new double[rows * columns];

      int rowIndex = 0;
      foreach(List<double> row in samplesList) {
        row.CopyTo(samples, rowIndex * columns);
        rowIndex++;
      }
    }

    /// <summary>
    /// Returns the first value token of a metadata entry.
    /// </summary>
    /// <returns>False when the entry does not exist or has no values.</returns>
    private bool TryGetFirstToken(string key, out Token token) {
      List<Token> values;
      if(metadata.TryGetValue(key, out values) && values.Count > 0) {
        token = values[0];
        return true;
      }
      token = null;
      return false;
    }

    /// <summary>
    /// Returns the integer value of the first token of a metadata entry, or
    /// <paramref name="defaultValue"/> when the entry is missing or has no values.
    /// </summary>
    private int GetIntMetadata(string key, int defaultValue) {
      Token token;
      if(TryGetFirstToken(key, out token)) {
        return token.intValue;
      }
      return defaultValue;
    }

    /// <summary>
    /// Parses the file with the invariant, the German (de-DE) and the current number format, in that
    /// order, and stops at the first format that parses without error.
    /// </summary>
    /// <exception cref="DataFormatException">Thrown (last failure) when every number format failed.</exception>
    private void TryParse(string importFileName, bool strict) {
      DataFormatException lastEx = null;
      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };

      // Remember the state before parsing so that a failed attempt can be undone. Otherwise rows and
      // metadata entries collected by a failed attempt would be duplicated by the next attempt.
      int rowsBefore = samplesList.Count;
      Dictionary<string, List<Token>> metadataBefore = new Dictionary<string, List<Token>>(metadata);

      foreach(NumberFormatInfo numberFormat in possibleFormats) {
        using(StreamReader reader = new StreamReader(importFileName)) {
          tokenizer = new Tokenizer(reader, numberFormat);
          try {
            Parse(strict);
            return; // parsed without errors
          } catch(DataFormatException ex) {
            lastEx = ex;
            // roll back everything the failed attempt has added
            samplesList.RemoveRange(rowsBefore, samplesList.Count - rowsBefore);
            metadata = new Dictionary<string, List<Token>>(metadataBefore);
          }
        }
      }
      // all number formats threw an exception -> rethrow the last exception
      throw lastEx;
    }

    #region tokenizer
    /// <summary>Kinds of tokens. The tokenizer itself only produces NewLine, String, Double and Int tokens.</summary>
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int, WhiteSpace
    }

    /// <summary>
    /// A single field of an input line: its text plus the numeric value for Int and Double tokens.
    /// </summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

    /// <summary>
    /// Splits the input line by line into tokens. Every line that is read is terminated by the shared
    /// <see cref="NewlineToken"/> instance, so callers can detect line ends by reference comparison.
    /// </summary>
    private class Tokenizer {
      private StreamReader reader;
      // tokens of the current line that have not been consumed yet; a queue makes Next() O(1)
      private Queue<Token> tokens;
      private string[] separators = new string[] { "@", "=", ";", "\t" };
      private NumberFormatInfo numberFormatInfo;

      // number of lines read so far (i.e. the 1-based number of the line held in CurrentLine)
      public int CurrentLineNumber = 0;
      public string CurrentLine;
      // 1-based number of the line that contained the token most recently returned by Next()
      public int LastTokenLineNumber = 0;

      public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
      // never produced by MakeToken; kept only as a well-known token instance
      public static readonly Token SeparatorToken = new Token(TokenTypeEnum.WhiteSpace, "");

      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }

      /// <summary>Creates a tokenizer and immediately reads the first line of <paramref name="reader"/>.</summary>
      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      // Reads the next line (if any) and queues its tokens followed by NewlineToken.
      private void ReadNextTokens() {
        if(reader.EndOfStream) {
          return;
        }
        CurrentLine = reader.ReadLine();
        string[] fields = CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        foreach(string field in fields) {
          tokens.Enqueue(MakeToken(field.Trim()));
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      // Classifies a field as Int, Double or (when it is not a number in the configured format) String.
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);

        // try to parse as a number first
        if(int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
        } else if(double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        // neither int nor double -> the token stays a string token
        return token;
      }

      /// <summary>Returns the next token without consuming it. Only valid while <see cref="HasNext"/> is true.</summary>
      public Token Peek() {
        return tokens.Peek();
      }

      /// <summary>Consumes and returns the next token; reads the following line once the current one is used up.</summary>
      public Token Next() {
        Token next = tokens.Dequeue();
        // all queued tokens belong to the line read last, so capture its number before refilling
        LastTokenLineNumber = CurrentLineNumber;
        if(tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    // Parses the metadata section followed by the sample rows; fails when no data values were found.
    private void Parse(bool strict) {
      ParseMetaData(strict);
      if(!tokenizer.HasNext()) {
        Error(NODATAVALUESMESSAGE, "", tokenizer.CurrentLineNumber);
      }
      ParseSampleData(strict);
      if(samplesList.Count == 0) {
        Error(NODATAVALUESMESSAGE, "", tokenizer.CurrentLineNumber);
      }
    }

    // Reads the remaining lines as rows of numbers and appends them to samplesList.
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while(tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if(current.type == TokenTypeEnum.Double) {
          row.Add(current.doubleValue);
        } else if(current.type == TokenTypeEnum.Int) {
          row.Add((double)current.intValue);
        } else if(current == Tokenizer.NewlineToken) {
          // the first row defines how many values every row must have
          if(samplesList.Count > 0) {
            int expectedColumns = samplesList[0].Count;
            if(!strict) {
              AdjustRowLength(row, expectedColumns);
            } else if(expectedColumns != row.Count) {
              Error("The first row of the dataset has " + expectedColumns + " columns." +
                "\nLine " + tokenizer.LastTokenLineNumber + " has " + row.Count + " columns.", "", tokenizer.LastTokenLineNumber);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // unexpected token: an error when parsing strictly, otherwise NaN is inserted instead
          if(strict) {
            Error("Unexpected token.", current.stringValue, tokenizer.LastTokenLineNumber);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

    // Pads the row with NaN or drops its trailing surplus elements so that it has exactly 'length' elements.
    private static void AdjustRowLength(List<double> row, int length) {
      if(row.Count > length) {
        row.RemoveRange(length, row.Count - length);
      } else {
        while(row.Count < length) {
          row.Add(double.NaN);
        }
      }
    }

    // Consumes all leading lines whose first token is a string: the first token is the entry name,
    // the remaining tokens of the line are its values. A later entry with the same name replaces an
    // earlier one.
    private void ParseMetaData(bool strict) {
      while(tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.String) {
        Token nameToken = tokenizer.Next();

        List<Token> tokens = new List<Token>();
        // every line ends with NewlineToken, so this loop stops at the end of the current line
        for(Token valueToken = tokenizer.Next(); valueToken != Tokenizer.NewlineToken; valueToken = tokenizer.Next()) {
          tokens.Add(valueToken);
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    // Consumes the next token and fails unless it is the given token instance.
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if(actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.LastTokenLineNumber);
      }
    }

    // Always throws a DataFormatException describing the problem.
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.