Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/3.2/DatasetParser.cs @ 3199

Last change on this file since 3199 was 2446, checked in by gkronber, 15 years ago

Fixed #788 (HL2 StructId problem importer doesn't work correctly if the first variable has a missing value in the first row.)

File size: 14.8 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
[2]27using HeuristicLab.Data;
[2446]28using System.Text;
[2]29
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// Parser for semicolon-separated dataset files.
  /// A file consists of optional metadata lines of the form <c>@NAME=value;value;...</c>
  /// followed by data rows of ';'-separated values. Values that cannot be read as numbers
  /// (or that are missing) are stored as <see cref="double.NaN"/>.
  /// </summary>
  public class DatasetParser {
    // names of the metadata entries that are understood by the property getters below
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";

    private Tokenizer tokenizer;
    // metadata name -> list of value tokens found after the '='
    private Dictionary<string, List<Token>> metadata;
    // one inner list per parsed data row
    private List<List<double>> samplesList;

    private int rows;
    /// <summary>Number of data rows of the last import.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of columns of the last import (length of the first data row).</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    /// <summary>
    /// The imported values in row-major order (element [row, column] is at index row * Columns + column).
    /// </summary>
    public double[] Samples {
      get { return samples; }
    }

    /// <summary>First value of the PROBLEMNAME entry or "-" if it is missing or empty.</summary>
    public string ProblemName {
      get {
        Token first;
        return TryGetFirstValue(PROBLEMNAME, out first) ? first.stringValue : "-";
      }
    }

    /// <summary>
    /// The values of the VARIABLENAMES entry. If the entry is missing or empty, names
    /// "X000", "X001", ... are generated, one for each column.
    /// </summary>
    public string[] VariableNames {
      get {
        List<Token> nameList;
        string[] names;
        if (metadata.TryGetValue(VARIABLENAMES, out nameList) && nameList.Count > 0) {
          names = new string[nameList.Count];
          for (int i = 0; i < names.Length; i++) {
            names[i] = nameList[i].stringValue;
          }
        } else {
          names = new string[columns];
          for (int i = 0; i < names.Length; i++) {
            names[i] = "X" + i.ToString("000");
          }
        }
        return names;
      }
    }

    /// <summary>Value of the TARGETVARIABLE entry; the default is the first column (0).</summary>
    public int TargetVariable {
      get { return GetIntValue(TARGETVARIABLE, 0); }
    }

    /// <summary>Value of the MAXIMUMTREEHEIGHT entry or 0 if it is missing.</summary>
    public int MaxTreeHeight {
      get { return GetIntValue(MAXIMUMTREEHEIGHT, 0); }
    }

    /// <summary>Value of the MAXIMUMTREESIZE entry or 0 if it is missing.</summary>
    public int MaxTreeSize {
      get { return GetIntValue(MAXIMUMTREESIZE, 0); }
    }

    /// <summary>Value of the TRAININGSAMPLESSTART entry or 0 if it is missing.</summary>
    public int TrainingSamplesStart {
      get { return GetIntValue(TRAININGSAMPLESSTART, 0); }
    }

    /// <summary>Value of the TRAININGSAMPLESEND entry or the number of rows if it is missing.</summary>
    public int TrainingSamplesEnd {
      get { return GetIntValue(TRAININGSAMPLESEND, rows); }
    }

    /// <summary>Value of the VALIDATIONSAMPLESSTART entry or 0 if it is missing.</summary>
    public int ValidationSamplesStart {
      get { return GetIntValue(VALIDATIONSAMPLESSTART, 0); }
    }

    /// <summary>Value of the VALIDATIONSAMPLESEND entry or the number of rows if it is missing.</summary>
    public int ValidationSamplesEnd {
      get { return GetIntValue(VALIDATIONSAMPLESEND, rows); }
    }

    /// <summary>Value of the TESTSAMPLESSTART entry or 0 if it is missing.</summary>
    public int TestSamplesStart {
      get { return GetIntValue(TESTSAMPLESSTART, 0); }
    }

    /// <summary>Value of the TESTSAMPLESEND entry or the number of rows if it is missing.</summary>
    public int TestSamplesEnd {
      get { return GetIntValue(TESTSAMPLESEND, rows); }
    }

    /// <summary>
    /// The integer values of the NONINPUTVARIABLES entry (an empty list if the entry is missing).
    /// A new list is created on every call.
    /// </summary>
    public List<int> NonInputVariables {
      get {
        List<int> disallowedVariables = new List<int>();
        List<Token> values;
        if (metadata.TryGetValue(NONINPUTVARIABLES, out values)) {
          foreach (Token t in values) {
            disallowedVariables.Add(t.intValue);
          }
        }
        return disallowedVariables;
      }
    }

    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    /// <summary>Discards all metadata and data rows collected so far.</summary>
    public void Reset() {
      metadata.Clear();
      samplesList.Clear();
    }

    /// <summary>
    /// Parses the given file and makes the result available via <see cref="Samples"/>,
    /// <see cref="Rows"/>, <see cref="Columns"/> and the metadata properties.
    /// Rows are appended to the rows of previous imports unless <see cref="Reset"/> is called.
    /// </summary>
    /// <param name="importFileName">Path of the file to read.</param>
    /// <param name="strict">
    /// When true every row must have as many values as the first row, otherwise rows are
    /// padded with NaN or truncated to the length of the first row.
    /// </param>
    /// <exception cref="DataFormatException">Thrown when the file cannot be parsed.</exception>
    public void Import(string importFileName, bool strict) {
      TryParse(importFileName, strict);

      // flatten the list of rows into one row-major array
      // (Parse guarantees that there is at least one row)
      rows = samplesList.Count;
      columns = samplesList[0].Count;
      samples = new double[rows * columns];
      for (int r = 0; r < rows; r++) {
        List<double> row = samplesList[r];
        for (int c = 0; c < row.Count; c++) {
          samples[r * columns + c] = row[c];
        }
      }
    }

    /// <summary>
    /// Tries to parse the file with several number formats (invariant, de-DE, current culture)
    /// and stops at the first format that parses without a <see cref="DataFormatException"/>.
    /// If all attempts fail the exception of the last attempt is thrown.
    /// </summary>
    private void TryParse(string importFileName, bool strict) {
      Exception lastEx = null;
      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };

      // remember the state before parsing so that a failed attempt can be undone,
      // otherwise the next attempt would append the same file to the partial result
      int rowsBefore = samplesList.Count;
      Dictionary<string, List<Token>> metadataBefore = new Dictionary<string, List<Token>>(metadata);

      foreach (NumberFormatInfo numberFormat in possibleFormats) {
        using (StreamReader reader = new StreamReader(importFileName)) {
          tokenizer = new Tokenizer(reader, numberFormat);
          try {
            // parse the file
            Parse(strict);
            return; // parsed without errors -> return;
          }
          catch (DataFormatException ex) {
            lastEx = ex;
            // roll back everything the failed attempt has added
            samplesList.RemoveRange(rowsBefore, samplesList.Count - rowsBefore);
            metadata.Clear();
            foreach (KeyValuePair<string, List<Token>> entry in metadataBefore) {
              metadata.Add(entry.Key, entry.Value);
            }
          }
        }
      }
      // all number formats threw an exception -> rethrow the last exception
      throw lastEx;
    }

    /// <summary>
    /// Returns the first value token of the metadata entry with the given name.
    /// </summary>
    /// <returns>false if the entry does not exist or has no values.</returns>
    private bool TryGetFirstValue(string key, out Token first) {
      List<Token> values;
      if (metadata.TryGetValue(key, out values) && values.Count > 0) {
        first = values[0];
        return true;
      }
      first = null;
      return false;
    }

    /// <summary>
    /// Returns the integer value of the first token of the metadata entry with the given name
    /// or <paramref name="defaultValue"/> if the entry does not exist or has no values.
    /// </summary>
    private int GetIntValue(string key, int defaultValue) {
      Token first;
      return TryGetFirstValue(key, out first) ? first.intValue : defaultValue;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, SemiColon, String, Double, Int
    }

    /// <summary>
    /// A single token of the input. <c>stringValue</c> always holds the original text,
    /// <c>intValue</c> / <c>doubleValue</c> are only set for tokens of type Int / Double.
    /// </summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the input line by line into tokens. Only the tokens of one line are buffered;
    /// the next line is read as soon as the buffer is empty.
    /// </summary>
    class Tokenizer {
      private StreamReader reader;
      // tokens of the current line in reading order (a queue because tokens are only removed at the front)
      private Queue<Token> tokens;
      private NumberFormatInfo numberFormatInfo;

      // number and text of the line that was read last
      public int CurrentLineNumber = 0;
      public string CurrentLine;

      // shared instances for the structural tokens; the parser compares tokens with these by reference
      public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
      public static readonly Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";");

      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      /// <summary>
      /// Reads the next line (if there is one) and appends its tokens followed by a newline token.
      /// Parts that are empty after trimming do not produce a token.
      /// </summary>
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        foreach (string part in Split(CurrentLine)) {
          string trimmed = part.Trim();
          if (trimmed.Length > 0) {
            tokens.Enqueue(MakeToken(trimmed));
          }
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      /// <summary>
      /// Cuts the line at each '@', '=' and ';'. The delimiter characters are returned as
      /// separate parts, the text between them may be empty.
      /// </summary>
      private IEnumerable<string> Split(string line) {
        StringBuilder subStr = new StringBuilder();
        foreach (char c in line) {
          if (c == '@' || c == '=' || c == ';') {
            yield return subStr.ToString();
            subStr = new StringBuilder();
            yield return c.ToString();
          } else {
            subStr.Append(c);
          }
        }
        yield return subStr.ToString();
      }

      /// <summary>
      /// Translates a trimmed part of a line into a token: one of the shared structural tokens,
      /// an Int token, a Double token or (if nothing else matches) a String token.
      /// </summary>
      private Token MakeToken(string strToken) {
        if (strToken.Equals(AtToken.stringValue)) return AtToken;
        if (strToken.Equals(AssignmentToken.stringValue)) return AssignmentToken;
        if (strToken.Equals(SeparatorToken.stringValue)) return SeparatorToken;

        Token token = new Token(TokenTypeEnum.String, strToken);
        if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        // if the token is neither an int nor a floating point number it stays a string token
        return token;
      }

      /// <summary>Returns the next token without consuming it.</summary>
      /// <exception cref="DataFormatException">Thrown when there are no more tokens.</exception>
      public Token Peek() {
        EnsureTokenAvailable();
        return tokens.Peek();
      }

      /// <summary>Consumes and returns the next token.</summary>
      /// <exception cref="DataFormatException">Thrown when there are no more tokens.</exception>
      public Token Next() {
        EnsureTokenAvailable();
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }

      // Reading past the end of the input (e.g. a file that ends in the middle of a metadata
      // entry) is reported as a parse error instead of an exception of the token buffer.
      private void EnsureTokenAvailable() {
        if (tokens.Count == 0) {
          throw new DataFormatException("Error while parsing.\nUnexpected end of file.", "", CurrentLineNumber);
        }
      }
    }
    #endregion

    #region parsing
    /// <summary>
    /// Parses the metadata section followed by the data rows.
    /// </summary>
    /// <exception cref="DataFormatException">
    /// Thrown on syntax errors and when no data rows are available after parsing.
    /// </exception>
    private void Parse(bool strict) {
      ParseMetaData(strict);
      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
      ParseSampleData(strict);
      if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    }

    /// <summary>
    /// Reads all remaining lines as data rows and appends them to the list of rows.
    /// The first row defines the number of columns.
    /// </summary>
    private void ParseSampleData(bool strict) {
      while (tokenizer.HasNext()) {
        // remember the line of this row; after the newline is consumed the tokenizer
        // has already read the following line
        int lineNumber = tokenizer.CurrentLineNumber;

        List<double> row = new List<double>();
        row.Add(NextValue(tokenizer, strict));
        while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
          Expect(Tokenizer.SeparatorToken);
          row.Add(NextValue(tokenizer, strict));
        }
        Expect(Tokenizer.NewlineToken);

        if (samplesList.Count > 0) {
          int expectedColumns = samplesList[0].Count;
          if (strict) {
            // when parsing strictly all rows have to have the same number of values
            if (row.Count != expectedColumns) {
              Error("The first row of the dataset has " + expectedColumns + " columns." +
                "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
            }
          } else if (row.Count < expectedColumns) {
            // when we are not strict then fill with NAN ...
            while (row.Count < expectedColumns) {
              row.Add(double.NaN);
            }
          } else if (row.Count > expectedColumns) {
            // ... or drop the surplus elements at the end of the row
            row.RemoveRange(expectedColumns, row.Count - expectedColumns);
          }
        }

        samplesList.Add(row);
      }
    }

    /// <summary>
    /// Reads one value of a data row. A missing value (the next token is a separator or a
    /// newline) yields NaN without consuming a token. String tokens also yield NaN, in strict
    /// mode as well. Any other non-numeric token is an error in strict mode and NaN otherwise.
    /// </summary>
    private double NextValue(Tokenizer tokenizer, bool strict) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == Tokenizer.SeparatorToken || upcoming == Tokenizer.NewlineToken) return double.NaN;

      Token current = tokenizer.Next();
      switch (current.type) {
        case TokenTypeEnum.Double:
          // just take the value
          return current.doubleValue;
        case TokenTypeEnum.Int:
          // translate the int value to double
          return (double)current.intValue;
        case TokenTypeEnum.SemiColon:
        case TokenTypeEnum.String:
          return double.NaN;
        default:
          // found an unexpected token => throw error when parsing strictly
          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
          if (strict) {
            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
          }
          return double.NaN;
      }
    }

    /// <summary>
    /// Reads the leading lines that start with '@'. Each line has the form
    /// <c>@NAME=value;value;...</c> (a trailing ';' is allowed); the value tokens are stored
    /// under NAME and replace an existing entry with the same name.
    /// </summary>
    private void ParseMetaData(bool strict) {
      while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        Expect(Tokenizer.AssignmentToken);

        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        if (valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
        }
        // Stop as soon as the newline of this entry has been consumed. Without this check a
        // following data row that starts with ';' (missing first value) would be read as a
        // continuation of the metadata entry.
        while (valueToken != Tokenizer.NewlineToken &&
               tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
          Expect(Tokenizer.SeparatorToken);
          valueToken = tokenizer.Next();
          if (valueToken != Tokenizer.NewlineToken) {
            tokens.Add(valueToken);
          }
        }
        if (valueToken != Tokenizer.NewlineToken) {
          Expect(Tokenizer.NewlineToken);
        }
        metadata[nameToken.stringValue] = tokens;
      }
    }

    /// <summary>
    /// Consumes the next token and raises a parse error if it is not the expected one.
    /// </summary>
    private void Expect(Token expectedToken) {
      // take the line number first; consuming the last token of a line reads the next line
      int lineNumber = tokenizer.CurrentLineNumber;
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, lineNumber);
      }
    }

    /// <summary>Throws a <see cref="DataFormatException"/> for the given token and line.</summary>
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.