
source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 272

Last change on this file since 272 was 272, checked in by gkronber, 16 years ago

fixed #158

File size: 10.9 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using HeuristicLab.Data;

namespace HeuristicLab.DataAnalysis {
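  // Parser for HeuristicLab dataset files: a block of metadata lines of the form
  // "@NAME = value ..." followed by rows of numbers separated by spaces, semicolons
  // or tabs. A minimal sketch of such a file (the keys match the properties below,
  // the values are made-up examples):
  //
  //   @PROBLEMNAME = ExampleProblem
  //   @VARIABLENAMES = x1 x2 y
  //   @TARGETVARIABLE = 2
  //   1.0 2.0 3.0
  //   4.0 5.0 6.0
  //
  // Typical use: new DatasetParser().Import("data.txt", true), then read Samples,
  // Rows and Columns ("data.txt" is just a placeholder file name).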
  public class DatasetParser {
    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }

    public string ProblemName {
      get {
        return metadata["PROBLEMNAME"][0].stringValue;
      }
    }

    public string[] VariableNames {
      get {
        List<Token> nameList = metadata["VARIABLENAMES"];
        string[] names = new string[nameList.Count];
        for(int i = 0; i < names.Length; i++) {
          names[i] = nameList[i].stringValue;
        }

        return names;
      }
    }

    public int TargetVariable {
      get {
        return metadata["TARGETVARIABLE"][0].intValue;
      }
    }

    public int MaxTreeHeight {
      get {
        return metadata["MAXIMUMTREEHEIGHT"][0].intValue;
      }
    }

    public int MaxTreeSize {
      get {
        return metadata["MAXIMUMTREESIZE"][0].intValue;
      }
    }

    public int TrainingSamplesStart {
      get {
        if(!metadata.ContainsKey("TRAININGSAMPLESSTART")) return 0;
        else return metadata["TRAININGSAMPLESSTART"][0].intValue;
      }
    }

    public int TrainingSamplesEnd {
      get {
        if(!metadata.ContainsKey("TRAININGSAMPLESEND")) return rows;
        else return metadata["TRAININGSAMPLESEND"][0].intValue;
      }
    }

    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

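    // Parses the given file and flattens the sample rows into the row-major samples
    // array (Rows x Columns). With strict == true any malformed row or unreadable value
    // aborts the import with an exception; otherwise rows are padded, truncated or
    // filled with NaN as needed (see ParseSampleData).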
    public void Import(string importFileName, bool strict) {
      StreamReader reader = new StreamReader(importFileName);
      this.tokenizer = new Tokenizer(reader);
      tokenizer.Separators = new string[] { " ", ";", "\t" };

      try {
        // parse the file
        Parse(strict);
      } finally {
        reader.Close();
      }

      // translate the list of samples into a DoubleMatrixData item
      samples = new double[samplesList.Count * samplesList[0].Count];
      rows = samplesList.Count;
      columns = samplesList[0].Count;

      int i = 0;
      int j = 0;
      foreach(List<double> row in samplesList) {
        j = 0;
        foreach(double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }

    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

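    // Simple line-oriented tokenizer: reads the input one line at a time, splits it on
    // the configured separators, classifies each piece via MakeToken and appends an
    // explicit newline token at the end of every line.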
    class Tokenizer {
      private StreamReader reader;
      private List<Token> tokens;
      private string[] separators;

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }

      public Tokenizer(StreamReader reader) {
        this.reader = reader;
        tokens = new List<Token>();
        ReadNextTokens();
      }

      private void ReadNextTokens() {
        if(!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
            return MakeToken(str);
          });

          tokens.AddRange(newTokens);
          tokens.Add(NewlineToken);
          CurrentLineNumber++;
        }
      }

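      // Turns one separator-delimited piece of text into a Token: "@" and "=" map to the
      // shared At/Assign tokens; otherwise the string is tried as an int and then as a
      // double using the invariant, German ("de-DE") and current cultures in that order,
      // and falls back to a plain string token if nothing parses.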
      private Token MakeToken(string strToken) {
        if(strToken == "@")
          return AtToken;
        else if(strToken == "=")
          return AssignmentToken;
        else {
          Token token = new Token(TokenTypeEnum.String, strToken);

          // try invariant culture
          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }
          // try german culture
          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // try current culture
          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // nothing worked
          return token;
        }
      }

      public Token Peek() {
        return tokens[0];
      }

      public Token Next() {
        Token next = tokens[0];
        tokens.RemoveAt(0);
        if(tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

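    // Reads the numeric sample rows up to the end of the file. In strict mode every row
    // must have exactly as many values as the first row; in non-strict mode short rows
    // are padded with NaN, long rows are truncated and unparsable values become NaN.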
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while(tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if(current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if(current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if(current == Tokenizer.NewlineToken) {
          // when parsing strictly all rows have to have the same number of values
          if(strict) {
            // the first row defines how many samples are needed
            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
            }
          } else if(samplesList.Count > 0) {
            // when we are not strict then fill or drop elements as needed
            if(samplesList[0].Count > row.Count) {
              // fill with NaN
              for(int i = row.Count; i < samplesList[0].Count; i++) {
                row.Add(double.NaN);
              }
            } else if(samplesList[0].Count < row.Count) {
              // drop the last k elements where k = row length - length of the first row
              row.RemoveRange(samplesList[0].Count, row.Count - samplesList[0].Count);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => raise an error when parsing strictly;
          // when parsing non-strictly unreadable values are replaced by NaN instead
          if(strict) {
            Error("Unknown value " + current + " in line " + tokenizer.CurrentLineNumber +
              "\n" + tokenizer.CurrentLine);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

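    // Consumes the leading "@NAME = value ..." lines and stores the value tokens of each
    // line in the metadata dictionary under NAME.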
    private void ParseMetaData(bool strict) {
      while(tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        if(nameToken.type != TokenTypeEnum.String)
          throw new Exception("Expected a variable name; got " + nameToken +
            "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);

        Expect(Tokenizer.AssignmentToken);

        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while(valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if(actualToken != expectedToken) {
        Error("Expected: " + expectedToken + " got: " + actualToken +
          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
      }
    }

    private void Error(string message) {
      throw new Exception("Error while parsing.\n" + message);
    }
    #endregion
  }
}