
source: branches/XmlTextWriterBranch/HeuristicLab.DataAnalysis/DatasetParser.cs @ 143

Last change on this file since 143 was 2, checked in by swagner, 17 years ago

Added HeuristicLab 3.0 sources from former SVN repository at revision 52

File size: 10.7 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using HeuristicLab.Data;

namespace HeuristicLab.DataAnalysis {
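  // Parses plain-text dataset files. As handled by ParseMetaData and ParseSampleData below,
  // a file starts with metadata lines of the form
  //   @ NAME = value1 value2 ...
  // (e.g. @ PROBLEMNAME, @ VARIABLENAMES, @ TARGETVARIABLE, @ TRAININGSAMPLESSTART, ...)
  // followed by one sample row per line, with values separated by ' ', ';' or '\t'.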
  public class DatasetParser {
    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }

    public string ProblemName {
      get {
        return metadata["PROBLEMNAME"][0].stringValue;
      }
    }

    public string[] VariableNames {
      get {
        List<Token> nameList = metadata["VARIABLENAMES"];
        string[] names = new string[nameList.Count];
        for (int i = 0; i < names.Length; i++) {
          names[i] = nameList[i].stringValue;
        }

        return names;
      }
    }

    public int TargetVariable {
      get {
        return metadata["TARGETVARIABLE"][0].intValue;
      }
    }

    public int MaxTreeHeight {
      get {
        return metadata["MAXIMUMTREEHEIGHT"][0].intValue;
      }
    }

    public int MaxTreeSize {
      get {
        return metadata["MAXIMUMTREESIZE"][0].intValue;
      }
    }

    public int TrainingSamplesStart {
      get {
        return metadata["TRAININGSAMPLESSTART"][0].intValue;
      }
    }

    public int TrainingSamplesEnd {
      get {
        return metadata["TRAININGSAMPLESEND"][0].intValue;
      }
    }

    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    public void Import(string importFileName, bool strict) {
      StreamReader reader = new StreamReader(importFileName);
      this.tokenizer = new Tokenizer(reader);
      tokenizer.Separators = new string[] { " ", ";", "\t" };

      // parse the file
      Parse(strict);

      // flatten the list of rows into a row-major samples array (used to build a DoubleMatrixData item)
      samples = new double[samplesList.Count * samplesList[0].Count];
      rows = samplesList.Count;
      columns = samplesList[0].Count;

      int i = 0;
      int j = 0;
      foreach (List<double> row in samplesList) {
        j = 0;
        foreach (double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }

    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

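    // Splits the input stream line by line into tokens using the configured separators:
    // "@" and "=" map to the static AtToken/AssignmentToken, numeric strings become Int or
    // Double tokens (trying invariant, German and current culture formats in that order),
    // anything else remains a String token; a NewlineToken is appended after each line.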
    class Tokenizer {
      private StreamReader reader;
      private List<Token> tokens;
      private string[] separators;

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }


      public Tokenizer(StreamReader reader) {
        this.reader = reader;
        tokens = new List<Token>();
        ReadNextTokens();
      }

      private void ReadNextTokens() {
        if (!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
            return MakeToken(str);
          });

          tokens.AddRange(newTokens);
          tokens.Add(NewlineToken);
          CurrentLineNumber++;
        }
      }

      private Token MakeToken(string strToken) {
        if (strToken == "@")
          return AtToken;
        else if (strToken == "=")
          return AssignmentToken;
        else {
          Token token = new Token(TokenTypeEnum.String, strToken);

          // try invariant culture
          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }
          // try German culture
          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // try current culture
          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
          if (int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if (double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // nothing worked
          return token;
        }
      }

      public Token Peek() {
        return tokens[0];
      }

      public Token Next() {
        Token next = tokens[0];
        tokens.RemoveAt(0);
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while (tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if (current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if (current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if (current == Tokenizer.NewlineToken) {
          // when parsing strictly all rows have to have the same number of values
          if (strict) {
            // the first row defines how many samples are needed
            if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
            }
          } else if (samplesList.Count > 0) {
            // when we are not strict then fill or drop elements as needed
            if (samplesList[0].Count > row.Count) {
              // fill with NAN
              for (int i = row.Count; i < samplesList[0].Count; i++) {
                row.Add(double.NaN);
              }
            } else if (samplesList[0].Count < row.Count) {
              // drop last k elements where k = n - length of first row
              row.RemoveRange(samplesList[0].Count, row.Count - samplesList[0].Count);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => raise an error when parsing strictly
          // when parsing non-strictly, unreadable values are replaced with NaN instead
          if (strict) {
            Error("Unknown value " + current + " in line " + tokenizer.CurrentLineNumber +
              "\n" + tokenizer.CurrentLine);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

    private void ParseMetaData(bool strict) {
      while (tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        if (nameToken.type != TokenTypeEnum.String)
          throw new Exception("Expected a variable name; got " + nameToken +
            "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);

        Expect(Tokenizer.AssignmentToken);

        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while (valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken + " got: " + actualToken +
          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
      }
    }

    private void Error(string message) {
      throw new Exception("Error while parsing.\n" + message);
    }
    #endregion
  }
}
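A minimal usage sketch of the parser follows. It is not part of the original file: the file name is hypothetical, strict parsing is assumed, and the metadata accessors only work if the corresponding @-lines are present in the dataset file.

using System;
using HeuristicLab.DataAnalysis;

class DatasetParserExample {
  static void Main() {
    DatasetParser parser = new DatasetParser();
    parser.Import("dataset.txt", true);  // hypothetical file; strict: all rows need equal column counts

    // these accessors assume @ PROBLEMNAME and @ VARIABLENAMES lines exist in the file
    Console.WriteLine(parser.ProblemName);
    Console.WriteLine(string.Join(" ", parser.VariableNames));

    // Samples is a flat, row-major array of size Rows * Columns
    for (int r = 0; r < parser.Rows; r++) {
      for (int c = 0; c < parser.Columns; c++) {
        double value = parser.Samples[r * parser.Columns + c];
        // ... use value
      }
    }
  }
}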