Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 202

Last change on this file since 202 was 173, checked in by gkronber, 17 years ago

fixed a bug caused by a non-matching string constant

File size: 10.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
28namespace HeuristicLab.DataAnalysis {
29  public class DatasetParser {
    private Tokenizer tokenizer;
    // parsed meta-data sections, keyed by upper-case section name (e.g. "PROBLEMNAME")
    private Dictionary<string, List<Token>> metadata;
    // parsed data rows; flattened into the 'samples' array by Import()
    private List<List<double>> samplesList;

    // number of data rows (set by Import())
    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    // number of columns, taken from the first data row (set by Import())
    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    // flat row-major matrix of all samples: element (i,j) is samples[i * columns + j]
    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }
52
53    public string ProblemName {
54      get {
55        return metadata["PROBLEMNAME"][0].stringValue;
56      }
57    }
58
59    public string[] VariableNames {
60      get {
61        List<Token> nameList = metadata["VARIABLENAMES"];
62        string[] names = new string[nameList.Count];
63        for (int i = 0; i < names.Length; i++) {
64          names[i] = nameList[i].stringValue;
65        }
66
67        return names;
68      }
69    }
70
71    public int TargetVariable {
72      get {
73        return metadata["TARGETVARIABLE"][0].intValue;
74      }
75    }
76
77    public int MaxTreeHeight {
78      get {
79        return metadata["MAXIMUMTREEHEIGHT"][0].intValue;
80      }
81    }
82
83    public int MaxTreeSize {
84      get {
85        return metadata["MAXIMUMTREESIZE"][0].intValue;
86      }
87    }
88
89    public int TrainingSamplesStart {
90      get {
91        if(!metadata.ContainsKey("TRAININGSAMPLESSTART")) return 0;
92        else return metadata["TRAININGSAMPLESSTART"][0].intValue;
93      }
94    }
95
96    public int TrainingSamplesEnd {
97      get {
98        if(!metadata.ContainsKey("TRAININGSAMPLESEND")) return rows;
99        else return metadata["TRAININGSAMPLESEND"][0].intValue;
100      }
101    }
102
103    public DatasetParser() {
104      this.metadata = new Dictionary<string, List<Token>>();
105      samplesList = new List<List<double>>();
106    }
107
108    public void Import(string importFileName, bool strict) {
109      StreamReader reader = new StreamReader(importFileName);
110      this.tokenizer = new Tokenizer(reader);
111      tokenizer.Separators = new string[] { " ", ";", "\t" };
112
113      // parse the file
114      Parse(strict);
115
116      // translate the list of samples into a DoubleMatrixData item
117      samples = new double[samplesList.Count * samplesList[0].Count];
118      rows = samplesList.Count;
119      columns = samplesList[0].Count;
120
121      int i = 0;
122      int j = 0;
123      foreach (List<double> row in samplesList) {
124        j = 0;
125        foreach (double element in row) {
126          samples[i * columns + j] = element;
127          j++;
128        }
129        i++;
130      }
131    }
132
133    #region tokenizer
    // token categories produced by the Tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }
137
138    internal class Token {
139      public TokenTypeEnum type;
140      public string stringValue;
141      public double doubleValue;
142      public int intValue;
143
144      public Token(TokenTypeEnum type, string value) {
145        this.type = type;
146        stringValue = value;
147        doubleValue = 0.0;
148        intValue = 0;
149      }
150
151      public override string ToString() {
152        return stringValue;
153      }
154    }
155
156
157    class Tokenizer {
158      private StreamReader reader;
159      private List<Token> tokens;
160      private string[] separators;
161
162      public int CurrentLineNumber = 0;
163      public string CurrentLine;
164
165      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
166      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
167      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
168
169      public string[] Separators {
170        get { return separators; }
171        set { separators = value; }
172      }
173
174
175      public Tokenizer(StreamReader reader) {
176        this.reader = reader;
177        tokens = new List<Token>();
178        ReadNextTokens();
179      }
180
181      private void ReadNextTokens() {
182        if (!reader.EndOfStream) {
183          CurrentLine = reader.ReadLine();
184          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
185            return MakeToken(str);
186          });
187
188          tokens.AddRange(newTokens);
189          tokens.Add(NewlineToken);
190          CurrentLineNumber++;
191        }
192      }
193
194      private Token MakeToken(string strToken) {
195        if (strToken == "@")
196          return AtToken;
197        else if (strToken == "=")
198          return AssignmentToken;
199        else {
200          Token token = new Token(TokenTypeEnum.String, strToken);
201
202          // try invariant culture
203          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
204          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
205            token.type = TokenTypeEnum.Int;
206            return token;
207          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
208            token.type = TokenTypeEnum.Double;
209            return token;
210          }
211          // try german culture
212          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
213          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
214            token.type = TokenTypeEnum.Int;
215            return token;
216          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
217            token.type = TokenTypeEnum.Double;
218            return token;
219          }
220
221          // try current culture
222          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
223          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
224            token.type = TokenTypeEnum.Int;
225            return token;
226          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
227            token.type = TokenTypeEnum.Double;
228            return token;
229          }
230
231          // nothing worked
232          return token;
233        }
234      }
235
236      public Token Peek() {
237        return tokens[0];
238      }
239
240      public Token Next() {
241        Token next = tokens[0];
242        tokens.RemoveAt(0);
243        if (tokens.Count == 0) {
244          ReadNextTokens();
245        }
246        return next;
247      }
248
249      public bool HasNext() {
250        return tokens.Count > 0 || !reader.EndOfStream;
251      }
252    }
253    #endregion
254
255    #region parsing
    // entry point of the parser: meta-data sections first, then the sample matrix
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }
260
261    private void ParseSampleData(bool strict) {
262      List<double> row = new List<double>();
263      while (tokenizer.HasNext()) {
264        Token current = tokenizer.Next();
265        if (current.type == TokenTypeEnum.Double) {
266          // just take the value
267          row.Add(current.doubleValue);
268        } else if (current.type == TokenTypeEnum.Int) {
269          // translate the int value to double
270          row.Add((double)current.intValue);
271        } else if (current == Tokenizer.NewlineToken) {
272          // when parsing strictly all rows have to have the same number of values           
273          if (strict) {
274            // the first row defines how many samples are needed
275            if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {
276              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
277                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
278            }
279          } else if (samplesList.Count > 0) {
280            // when we are not strict then fill or drop elements as needed
281            if (samplesList[0].Count > row.Count) {
282              // fill with NAN
283              for (int i = row.Count; i < samplesList[0].Count; i++) {
284                row.Add(double.NaN);
285              }
286            } else if (samplesList[0].Count < row.Count) {
287              // drop last k elements where k = n - length of first row
288              row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
289            }
290          }
291
292          // add the current row to the collection of rows and start a new row
293          samplesList.Add(row);
294          row = new List<double>();
295        } else {
296          // found an unexpected token => return false when parsing strictly
297          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
298          if (strict) {
299            Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber +
300              "\n" + tokenizer.CurrentLine);
301          } else {
302            row.Add(double.NaN);
303          }
304        }
305      }
306    }
307
308    private void ParseMetaData(bool strict) {
309      while (tokenizer.Peek() == Tokenizer.AtToken) {
310        Expect(Tokenizer.AtToken);
311
312        Token nameToken = tokenizer.Next();
313        if (nameToken.type != TokenTypeEnum.String)
314          throw new Exception("Expected a variable name; got " + nameToken +
315            "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
316
317        Expect(Tokenizer.AssignmentToken);
318
319        List<Token> tokens = new List<Token>();
320        Token valueToken = tokenizer.Next();
321        while (valueToken != Tokenizer.NewlineToken) {
322          tokens.Add(valueToken);
323          valueToken = tokenizer.Next();
324        }
325
326        metadata[nameToken.stringValue] = tokens;
327      }
328    }
329
330    private void Expect(Token expectedToken) {
331      Token actualToken = tokenizer.Next();
332      if (actualToken != expectedToken) {
333        Error("Expected: " + expectedToken + " got: " + actualToken +
334          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
335      }
336    }
337
    // raises a parsing error with a common message prefix; never returns
    private void Error(string message) {
      throw new Exception("Error while parsing.\n" + message);
    }
341    #endregion
342  }
343}
Note: See TracBrowser for help on using the repository browser.