Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 308

Last change on this file since 308 was 273, checked in by gkronber, 16 years ago

fixed #160

File size: 11.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
28namespace HeuristicLab.DataAnalysis {
/// <summary>
/// Reads a text dataset consisting of optional metadata lines of the form
/// "@ NAME = value value ..." followed by rows of numeric samples.
/// Values may be separated by blanks, semicolons or tabs.
/// </summary>
/// <remarks>
/// The parser accumulates state: calling <see cref="Import"/> more than once on the same
/// instance appends the newly read rows and metadata to those already read.
/// </remarks>
public class DatasetParser {
  private const string PROBLEMNAME = "PROBLEMNAME";
  private const string VARIABLENAMES = "VARIABLENAMES";
  private const string TARGETVARIABLE = "TARGETVARIABLE";
  private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
  private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
  private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
  private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
  private Tokenizer tokenizer;
  private Dictionary<string, List<Token>> metadata;
  private List<List<double>> samplesList;

  private int rows;
  /// <summary>Number of sample rows found by the last import.</summary>
  public int Rows {
    get { return rows; }
    set { rows = value; }
  }

  private int columns;
  /// <summary>Number of values per row; defined by the first row of the dataset.</summary>
  public int Columns {
    get { return columns; }
    set { columns = value; }
  }

  private double[] samples;
  /// <summary>
  /// All samples in row-major order (element [row, column] is at index row * Columns + column).
  /// Null until <see cref="Import"/> has completed successfully.
  /// </summary>
  public double[] Samples {
    get { return samples; }
  }

  /// <summary>Value of the PROBLEMNAME metadata entry, or "-" when it is missing or has no value.</summary>
  public string ProblemName {
    get {
      Token token = FirstMetadataToken(PROBLEMNAME);
      return token != null ? token.stringValue : "-";
    }
  }

  /// <summary>
  /// Names given by the VARIABLENAMES metadata entry. When the entry is missing,
  /// generated names "X000", "X001", ... (one per column) are returned instead.
  /// </summary>
  public string[] VariableNames {
    get {
      string[] names;
      List<Token> nameList;
      if(metadata.TryGetValue(VARIABLENAMES, out nameList)) {
        names = new string[nameList.Count];
        for(int i = 0; i < names.Length; i++) {
          names[i] = nameList[i].stringValue;
        }
      } else {
        names = new string[columns];
        for(int i = 0; i < names.Length; i++) {
          names[i] = "X" + i.ToString("000");
        }
      }
      return names;
    }
  }

  /// <summary>Index given by TARGETVARIABLE; defaults to 0 (the first column).</summary>
  public int TargetVariable {
    get { return GetIntMetadata(TARGETVARIABLE, 0); }
  }

  /// <summary>Value of MAXIMUMTREEHEIGHT; defaults to 0.</summary>
  public int MaxTreeHeight {
    get { return GetIntMetadata(MAXIMUMTREEHEIGHT, 0); }
  }

  /// <summary>Value of MAXIMUMTREESIZE; defaults to 0.</summary>
  public int MaxTreeSize {
    get { return GetIntMetadata(MAXIMUMTREESIZE, 0); }
  }

  /// <summary>Value of TRAININGSAMPLESSTART; defaults to 0.</summary>
  public int TrainingSamplesStart {
    get { return GetIntMetadata(TRAININGSAMPLESSTART, 0); }
  }

  /// <summary>Value of TRAININGSAMPLESEND; defaults to the number of rows.</summary>
  public int TrainingSamplesEnd {
    get { return GetIntMetadata(TRAININGSAMPLESEND, rows); }
  }

  public DatasetParser() {
    this.metadata = new Dictionary<string, List<Token>>();
    samplesList = new List<List<double>>();
  }

  /// <summary>
  /// Returns the first value token stored for a metadata key, or null when the key
  /// is absent or was declared without any value (e.g. "@ NAME =").
  /// </summary>
  private Token FirstMetadataToken(string key) {
    List<Token> values;
    if(metadata.TryGetValue(key, out values) && values.Count > 0) {
      return values[0];
    }
    return null;
  }

  /// <summary>
  /// Returns the integer value of the first token of a metadata entry, or
  /// <paramref name="defaultValue"/> when the entry is absent or empty.
  /// A value that could not be parsed as an integer yields 0 (the token's initial intValue).
  /// </summary>
  private int GetIntMetadata(string key, int defaultValue) {
    Token token = FirstMetadataToken(key);
    return token != null ? token.intValue : defaultValue;
  }

  /// <summary>
  /// Parses the given file and fills <see cref="Samples"/>, <see cref="Rows"/>,
  /// <see cref="Columns"/> and the metadata properties.
  /// </summary>
  /// <param name="importFileName">Path of the file to read.</param>
  /// <param name="strict">
  /// When true, every row must have as many values as the first row and every value must be
  /// numeric, otherwise a DataFormatException is thrown. When false, short rows are padded
  /// with NaN, surplus values are dropped and unreadable values become NaN.
  /// </param>
  public void Import(string importFileName, bool strict) {
    // 'using' guarantees the reader is released even when tokenizing or parsing fails
    using(StreamReader reader = new StreamReader(importFileName)) {
      this.tokenizer = new Tokenizer(reader);
      tokenizer.Separators = new string[] { " ", ";", "\t" };
      Parse(strict);
    }

    // flatten the list of rows into one row-major array;
    // a file without any row yields an empty dataset instead of an index error
    rows = samplesList.Count;
    columns = rows > 0 ? samplesList[0].Count : 0;
    samples = new double[rows * columns];
    for(int i = 0; i < rows; i++) {
      // every row was checked against / normalized to the length of the first row while parsing
      samplesList[i].CopyTo(samples, i * columns);
    }
  }

  #region tokenizer
  internal enum TokenTypeEnum {
    At, Assign, NewLine, String, Double, Int
  }

  /// <summary>
  /// A single lexical element. stringValue always holds the original text;
  /// intValue / doubleValue are only meaningful for tokens of type Int / Double.
  /// </summary>
  internal class Token {
    public TokenTypeEnum type;
    public string stringValue;
    public double doubleValue;
    public int intValue;

    public Token(TokenTypeEnum type, string value) {
      this.type = type;
      stringValue = value;
      doubleValue = 0.0;
      intValue = 0;
    }

    public override string ToString() {
      return stringValue;
    }
  }


  /// <summary>
  /// Splits the input line by line into tokens. Every line is terminated by NewlineToken.
  /// </summary>
  class Tokenizer {
    private StreamReader reader;
    // tokens of the current line that have not been consumed yet
    private Queue<Token> tokens;
    private string[] separators;

    // number and text of the line that was read most recently
    public int CurrentLineNumber = 0;
    public string CurrentLine;

    // shared instances; the parser compares against them by reference
    public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
    public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
    public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

    /// <summary>
    /// Strings that separate tokens within a line. When left null the line is split at white-space.
    /// </summary>
    public string[] Separators {
      get { return separators; }
      set { separators = value; }
    }


    public Tokenizer(StreamReader reader) {
      this.reader = reader;
      tokens = new Queue<Token>();
      // Nothing is read here: lines are fetched on demand so that the separators
      // assigned after construction are also applied to the first line.
    }

    /// <summary>Reads the next line (if any) and appends its tokens followed by NewlineToken.</summary>
    private void ReadNextTokens() {
      if(reader.EndOfStream) return;

      CurrentLine = reader.ReadLine();
      foreach(string part in CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
        tokens.Enqueue(MakeToken(part));
      }
      tokens.Enqueue(NewlineToken);
      CurrentLineNumber++;
    }

    /// <summary>Fetches the next line when all tokens of the current line have been consumed.</summary>
    private void EnsureTokens() {
      if(tokens.Count == 0) {
        ReadNextTokens();
      }
    }

    /// <summary>
    /// Tries to read the text of <paramref name="token"/> as an integer and, failing that,
    /// as a floating point number using the given format. On success the token's type and
    /// value are updated.
    /// </summary>
    private static bool TryParseNumber(Token token, NumberFormatInfo format) {
      if(int.TryParse(token.stringValue, NumberStyles.Integer, format, out token.intValue)) {
        token.type = TokenTypeEnum.Int;
        return true;
      }
      if(double.TryParse(token.stringValue, NumberStyles.Float, format, out token.doubleValue)) {
        token.type = TokenTypeEnum.Double;
        return true;
      }
      return false;
    }

    /// <summary>
    /// Classifies a piece of text as "@", "=", integer, double or plain string.
    /// Numbers are tried in the invariant, then the German, then the current culture.
    /// </summary>
    private Token MakeToken(string strToken) {
      if(strToken == "@") return AtToken;
      if(strToken == "=") return AssignmentToken;

      Token token = new Token(TokenTypeEnum.String, strToken);
      // the order matters: the first culture that accepts the text wins
      if(!TryParseNumber(token, CultureInfo.InvariantCulture.NumberFormat)
        && !TryParseNumber(token, CultureInfo.GetCultureInfo("de-DE").NumberFormat)) {
        TryParseNumber(token, CultureInfo.CurrentCulture.NumberFormat);
      }
      // when nothing worked the token remains a String token
      return token;
    }

    /// <summary>
    /// Returns the next token without consuming it.
    /// Throws InvalidOperationException when the input is exhausted (see HasNext).
    /// </summary>
    public Token Peek() {
      EnsureTokens();
      return tokens.Peek();
    }

    /// <summary>
    /// Consumes and returns the next token.
    /// Throws InvalidOperationException when the input is exhausted (see HasNext).
    /// </summary>
    public Token Next() {
      EnsureTokens();
      return tokens.Dequeue();
    }

    /// <summary>True while there are unconsumed tokens or unread lines.</summary>
    public bool HasNext() {
      return tokens.Count > 0 || !reader.EndOfStream;
    }
  }
  #endregion

  #region parsing
  /// <summary>Parses the leading metadata block and then the sample rows.</summary>
  private void Parse(bool strict) {
    ParseMetaData(strict);
    ParseSampleData(strict);
  }

  /// <summary>
  /// Reads all remaining tokens as rows of numbers and appends them to the list of samples.
  /// The first row of the dataset defines the expected number of values per row.
  /// </summary>
  private void ParseSampleData(bool strict) {
    List<double> row = new List<double>();
    while(tokenizer.HasNext()) {
      Token current = tokenizer.Next();
      if(current.type == TokenTypeEnum.Double) {
        // just take the value
        row.Add(current.doubleValue);
      } else if(current.type == TokenTypeEnum.Int) {
        // translate the int value to double
        row.Add((double)current.intValue);
      } else if(current == Tokenizer.NewlineToken) {
        if(samplesList.Count > 0) {
          int expectedCount = samplesList[0].Count;
          if(strict) {
            // when parsing strictly all rows have to have the same number of values
            if(expectedCount != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
            }
          } else if(expectedCount > row.Count) {
            // when we are not strict fill missing values with NaN
            for(int i = row.Count; i < expectedCount; i++) {
              row.Add(double.NaN);
            }
          } else if(expectedCount < row.Count) {
            // drop the surplus values at the end, keeping the first expectedCount values
            row.RemoveRange(expectedCount, row.Count - expectedCount);
          }
        }

        // add the current row to the collection of rows and start a new row
        samplesList.Add(row);
        row = new List<double>();
      } else {
        // found an unexpected token => fail when parsing strictly
        // when we are parsing non-strictly unreadable values are replaced by NaN
        if(strict) {
          Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
        } else {
          row.Add(double.NaN);
        }
      }
    }
  }

  /// <summary>
  /// Reads consecutive lines of the form "@ NAME = value ..." from the start of the input
  /// and stores the value tokens under NAME. A later entry with the same name replaces an earlier one.
  /// </summary>
  private void ParseMetaData(bool strict) {
    // HasNext guards against peeking into an empty (or fully consumed) input
    while(tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
      Expect(Tokenizer.AtToken);

      Token nameToken = tokenizer.Next();
      if(nameToken.type != TokenTypeEnum.String)
        Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);

      Expect(Tokenizer.AssignmentToken);

      // everything up to the end of the line belongs to this entry
      List<Token> tokens = new List<Token>();
      Token valueToken = tokenizer.Next();
      while(valueToken != Tokenizer.NewlineToken) {
        tokens.Add(valueToken);
        valueToken = tokenizer.Next();
      }

      metadata[nameToken.stringValue] = tokens;
    }
  }

  /// <summary>Consumes the next token and raises a parse error when it is not the expected one.</summary>
  private void Expect(Token expectedToken) {
    Token actualToken = tokenizer.Next();
    if(actualToken != expectedToken) {
      Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
    }
  }

  /// <summary>Throws a DataFormatException describing the problem, the offending token and its line.</summary>
  private void Error(string message, string token, int lineNumber) {
    throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
  }
  #endregion
}
368}
Note: See TracBrowser for help on using the repository browser.