Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 397

Last change on this file since 397 was 397, checked in by gkronber, 16 years ago

fixed #206

File size: 13.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
28namespace HeuristicLab.DataAnalysis {
29  public class DatasetParser {
    // Names of the recognized metadata entries in the dataset header
    // (header lines have the form "@NAME = value ...", see ParseMetaData).
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private Tokenizer tokenizer;                        // token stream over the input file
    private Dictionary<string, List<Token>> metadata;   // header entry name -> its value tokens
    private List<List<double>> samplesList;             // parsed rows, one inner list per input line
44
    // number of parsed sample rows (set by Import)
    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }
50
    // number of values per row, taken from the first parsed row (set by Import)
    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }
56
    // all parsed values in row-major order: samples[row * Columns + column]
    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }
63
64    public string ProblemName {
65      get {
66        if(metadata.ContainsKey(PROBLEMNAME)) {
67          return metadata[PROBLEMNAME][0].stringValue;
68        } else return "-";
69      }
70    }
71
72    public string[] VariableNames {
73      get {
74        if(metadata.ContainsKey(VARIABLENAMES)) {
75          List<Token> nameList = metadata[VARIABLENAMES];
76          string[] names = new string[nameList.Count];
77          for(int i = 0; i < names.Length; i++) {
78            names[i] = nameList[i].stringValue;
79          }
80          return names;
81        } else {
82          string[] names = new string[columns];
83          for(int i = 0; i < names.Length; i++) {
84            names[i] = "X" + i.ToString("000");
85          }
86          return names;
87        }
88      }
89    }
90
91    public int TargetVariable {
92      get {
93        if(metadata.ContainsKey(TARGETVARIABLE)) {
94          return metadata[TARGETVARIABLE][0].intValue;
95        } else return 0; // default is the first column
96      }
97    }
98
99    public int MaxTreeHeight {
100      get {
101        if(metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
102          return metadata[MAXIMUMTREEHEIGHT][0].intValue;
103        } else return 0;
104      }
105    }
106
107    public int MaxTreeSize {
108      get {
109        if(metadata.ContainsKey(MAXIMUMTREESIZE)) {
110          return metadata[MAXIMUMTREESIZE][0].intValue;
111        } else return 0;
112      }
113    }
114
115    public int TrainingSamplesStart {
116      get {
117        if(metadata.ContainsKey(TRAININGSAMPLESSTART)) {
118          return metadata[TRAININGSAMPLESSTART][0].intValue;
119        } else return 0;
120      }
121    }
122
123    public int TrainingSamplesEnd {
124      get {
125        if(metadata.ContainsKey(TRAININGSAMPLESEND)) {
126          return metadata[TRAININGSAMPLESEND][0].intValue;
127        } else return rows;
128      }
129    }
130    public int ValidationSamplesStart {
131      get {
132        if(metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
133          return metadata[VALIDATIONSAMPLESSTART][0].intValue;
134        } else return 0;
135      }
136    }
137
138    public int ValidationSamplesEnd {
139      get {
140        if(metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
141          return metadata[VALIDATIONSAMPLESEND][0].intValue;
142        } else return rows;
143      }
144    }
145    public int TestSamplesStart {
146      get {
147        if(metadata.ContainsKey(TESTSAMPLESSTART)) {
148          return metadata[TESTSAMPLESSTART][0].intValue;
149        } else return 0;
150      }
151    }
152
153    public int TestSamplesEnd {
154      get {
155        if(metadata.ContainsKey(TESTSAMPLESEND)) {
156          return metadata[TESTSAMPLESEND][0].intValue;
157        } else return rows;
158      }
159    }
160
161    public DatasetParser() {
162      this.metadata = new Dictionary<string, List<Token>>();
163      samplesList = new List<List<double>>();
164    }
165
166    public void Import(string importFileName, bool strict) {
167      StreamReader reader = new StreamReader(importFileName);
168      this.tokenizer = new Tokenizer(reader);
169      tokenizer.Separators = new string[] { " ", ";", "\t" };
170
171      try {
172        // parse the file
173        Parse(strict);
174      } finally {
175        reader.Close();
176      }
177
178      // translate the list of samples into a DoubleMatrixData item
179      samples = new double[samplesList.Count * samplesList[0].Count];
180      rows = samplesList.Count;
181      columns = samplesList[0].Count;
182
183      int i = 0;
184      int j = 0;
185      foreach(List<double> row in samplesList) {
186        j = 0;
187        foreach(double element in row) {
188          samples[i * columns + j] = element;
189          j++;
190        }
191        i++;
192      }
193    }
194
195    #region tokenizer
196    internal enum TokenTypeEnum {
197      At, Assign, NewLine, String, Double, Int
198    }
199
200    internal class Token {
201      public TokenTypeEnum type;
202      public string stringValue;
203      public double doubleValue;
204      public int intValue;
205
206      public Token(TokenTypeEnum type, string value) {
207        this.type = type;
208        stringValue = value;
209        doubleValue = 0.0;
210        intValue = 0;
211      }
212
213      public override string ToString() {
214        return stringValue;
215      }
216    }
217
218
219    class Tokenizer {
220      private StreamReader reader;
221      private List<Token> tokens;
222      private string[] separators;
223
224      public int CurrentLineNumber = 0;
225      public string CurrentLine;
226
227      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
228      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
229      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
230
231      public string[] Separators {
232        get { return separators; }
233        set { separators = value; }
234      }
235
236
237      public Tokenizer(StreamReader reader) {
238        this.reader = reader;
239        tokens = new List<Token>();
240        ReadNextTokens();
241      }
242
243      private void ReadNextTokens() {
244        if(!reader.EndOfStream) {
245          CurrentLine = reader.ReadLine();
246          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
247            return MakeToken(str);
248          });
249
250          tokens.AddRange(newTokens);
251          tokens.Add(NewlineToken);
252          CurrentLineNumber++;
253        }
254      }
255
256      private Token MakeToken(string strToken) {
257        if(strToken == "@")
258          return AtToken;
259        else if(strToken == "=")
260          return AssignmentToken;
261        else {
262          Token token = new Token(TokenTypeEnum.String, strToken);
263
264          // try invariant culture
265          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
266          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
267            token.type = TokenTypeEnum.Int;
268            return token;
269          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
270            token.type = TokenTypeEnum.Double;
271            return token;
272          }
273          // try german culture
274          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
275          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
276            token.type = TokenTypeEnum.Int;
277            return token;
278          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
279            token.type = TokenTypeEnum.Double;
280            return token;
281          }
282
283          // try current culture
284          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
285          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
286            token.type = TokenTypeEnum.Int;
287            return token;
288          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
289            token.type = TokenTypeEnum.Double;
290            return token;
291          }
292
293          // nothing worked
294          return token;
295        }
296      }
297
298      public Token Peek() {
299        return tokens[0];
300      }
301
302      public Token Next() {
303        Token next = tokens[0];
304        tokens.RemoveAt(0);
305        if(tokens.Count == 0) {
306          ReadNextTokens();
307        }
308        return next;
309      }
310
311      public bool HasNext() {
312        return tokens.Count > 0 || !reader.EndOfStream;
313      }
314    }
315    #endregion
316
317    #region parsing
    // A dataset file consists of a metadata header ("@NAME = ..." lines)
    // followed by the numeric sample rows; parse them in that order.
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }
322
323    private void ParseSampleData(bool strict) {
324      List<double> row = new List<double>();
325      while(tokenizer.HasNext()) {
326        Token current = tokenizer.Next();
327        if(current.type == TokenTypeEnum.Double) {
328          // just take the value
329          row.Add(current.doubleValue);
330        } else if(current.type == TokenTypeEnum.Int) {
331          // translate the int value to double
332          row.Add((double)current.intValue);
333        } else if(current == Tokenizer.NewlineToken) {
334          // when parsing strictly all rows have to have the same number of values           
335          if(strict) {
336            // the first row defines how many samples are needed
337            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
338              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
339                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
340            }
341          } else if(samplesList.Count > 0) {
342            // when we are not strict then fill or drop elements as needed
343            if(samplesList[0].Count > row.Count) {
344              // fill with NAN
345              for(int i = row.Count; i < samplesList[0].Count; i++) {
346                row.Add(double.NaN);
347              }
348            } else if(samplesList[0].Count < row.Count) {
349              // drop last k elements where k = n - length of first row
350              row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
351            }
352          }
353
354          // add the current row to the collection of rows and start a new row
355          samplesList.Add(row);
356          row = new List<double>();
357        } else {
358          // found an unexpected token => return false when parsing strictly
359          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
360          if(strict) {
361            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
362          } else {
363            row.Add(double.NaN);
364          }
365        }
366      }
367    }
368
369    private void ParseMetaData(bool strict) {
370      while(tokenizer.Peek() == Tokenizer.AtToken) {
371        Expect(Tokenizer.AtToken);
372
373        Token nameToken = tokenizer.Next();
374        if(nameToken.type != TokenTypeEnum.String)
375          Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
376
377        Expect(Tokenizer.AssignmentToken);
378
379        List<Token> tokens = new List<Token>();
380        Token valueToken = tokenizer.Next();
381        while(valueToken != Tokenizer.NewlineToken) {
382          tokens.Add(valueToken);
383          valueToken = tokenizer.Next();
384        }
385
386        metadata[nameToken.stringValue] = tokens;
387      }
388    }
389
390    private void Expect(Token expectedToken) {
391      Token actualToken = tokenizer.Next();
392      if(actualToken != expectedToken) {
393        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
394      }
395    }
396
397    private void Error(string message, string token, int lineNumber) {
398      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
399    }
400    #endregion
401  }
402}
Note: See TracBrowser for help on using the repository browser.