Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/3.2/DatasetParser.cs @ 2219

Last change on this file since 2219 was 1529, checked in by gkronber, 16 years ago

Moved source files of plugins AdvancedOptimizationFrontEnd ... Grid into version-specific sub-folders. #576

File size: 14.2 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
28namespace HeuristicLab.DataAnalysis {
29  public class DatasetParser {
    // Keys of the metadata entries recognized in the file header
    // (lines of the form "<NAME> <value> [<value> ...]").
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";
    private Tokenizer tokenizer;                        // tokenizer for the current parse attempt (recreated per number format)
    private Dictionary<string, List<Token>> metadata;   // metadata name -> value tokens of that header line
    private List<List<double>> samplesList;             // parsed data rows (row-major)
45
    // Number of parsed data rows; set by Import().
    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    // Number of columns per row (taken from the first parsed row); set by Import().
    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    // Flattened row-major matrix of the parsed values (length = rows * columns);
    // filled by Import().
    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }
64
65    public string ProblemName {
66      get {
[1221]67        if (metadata.ContainsKey(PROBLEMNAME)) {
[273]68          return metadata[PROBLEMNAME][0].stringValue;
69        } else return "-";
[2]70      }
71    }
72
73    public string[] VariableNames {
74      get {
[1221]75        if (metadata.ContainsKey(VARIABLENAMES)) {
[273]76          List<Token> nameList = metadata[VARIABLENAMES];
77          string[] names = new string[nameList.Count];
[1221]78          for (int i = 0; i < names.Length; i++) {
[273]79            names[i] = nameList[i].stringValue;
80          }
81          return names;
82        } else {
83          string[] names = new string[columns];
[1221]84          for (int i = 0; i < names.Length; i++) {
[273]85            names[i] = "X" + i.ToString("000");
86          }
87          return names;
[2]88        }
89      }
90    }
91
92    public int TargetVariable {
93      get {
[1221]94        if (metadata.ContainsKey(TARGETVARIABLE)) {
[273]95          return metadata[TARGETVARIABLE][0].intValue;
96        } else return 0; // default is the first column
[2]97      }
98    }
99
100    public int MaxTreeHeight {
101      get {
[1221]102        if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
[273]103          return metadata[MAXIMUMTREEHEIGHT][0].intValue;
104        } else return 0;
[2]105      }
106    }
107
108    public int MaxTreeSize {
109      get {
[1221]110        if (metadata.ContainsKey(MAXIMUMTREESIZE)) {
[273]111          return metadata[MAXIMUMTREESIZE][0].intValue;
112        } else return 0;
[2]113      }
114    }
115
116    public int TrainingSamplesStart {
117      get {
[1221]118        if (metadata.ContainsKey(TRAININGSAMPLESSTART)) {
[273]119          return metadata[TRAININGSAMPLESSTART][0].intValue;
120        } else return 0;
[2]121      }
122    }
123
124    public int TrainingSamplesEnd {
125      get {
[1221]126        if (metadata.ContainsKey(TRAININGSAMPLESEND)) {
[273]127          return metadata[TRAININGSAMPLESEND][0].intValue;
128        } else return rows;
[2]129      }
130    }
[363]131    public int ValidationSamplesStart {
132      get {
[1221]133        if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
[363]134          return metadata[VALIDATIONSAMPLESSTART][0].intValue;
135        } else return 0;
136      }
137    }
[2]138
[363]139    public int ValidationSamplesEnd {
140      get {
[1221]141        if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
[363]142          return metadata[VALIDATIONSAMPLESEND][0].intValue;
143        } else return rows;
144      }
145    }
[397]146    public int TestSamplesStart {
147      get {
[1221]148        if (metadata.ContainsKey(TESTSAMPLESSTART)) {
[397]149          return metadata[TESTSAMPLESSTART][0].intValue;
150        } else return 0;
151      }
152    }
[363]153
[397]154    public int TestSamplesEnd {
155      get {
[1221]156        if (metadata.ContainsKey(TESTSAMPLESEND)) {
[397]157          return metadata[TESTSAMPLESEND][0].intValue;
158        } else return rows;
159      }
160    }
161
[487]162    public List<int> NonInputVariables {
163      get {
164        List<int> disallowedVariables = new List<int>();
[1221]165        if (metadata.ContainsKey(NONINPUTVARIABLES)) {
166          foreach (Token t in metadata[NONINPUTVARIABLES]) {
[487]167            disallowedVariables.Add(t.intValue);
168          }
169        }
170        return disallowedVariables;
171      }
172    }
173
[2]174    public DatasetParser() {
175      this.metadata = new Dictionary<string, List<Token>>();
176      samplesList = new List<List<double>>();
177    }
178
[404]179    public void Reset() {
180      metadata.Clear();
181      samplesList.Clear();
182    }
183
[2]184    public void Import(string importFileName, bool strict) {
[405]185      TryParse(importFileName, strict);
[2]186      // translate the list of samples into a DoubleMatrixData item
187      samples = new double[samplesList.Count * samplesList[0].Count];
188      rows = samplesList.Count;
189      columns = samplesList[0].Count;
190
191      int i = 0;
192      int j = 0;
[1221]193      foreach (List<double> row in samplesList) {
[2]194        j = 0;
[1221]195        foreach (double element in row) {
[2]196          samples[i * columns + j] = element;
197          j++;
198        }
199        i++;
200      }
201    }
202
[405]203    private void TryParse(string importFileName, bool strict) {
204      Exception lastEx = null;
205      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };
[1221]206      foreach (NumberFormatInfo numberFormat in possibleFormats) {
207        using (StreamReader reader = new StreamReader(importFileName)) {
[405]208          tokenizer = new Tokenizer(reader, numberFormat);
209          try {
210            // parse the file
211            Parse(strict);
212            return; // parsed without errors -> return;
[1221]213          }
214          catch (DataFormatException ex) {
[405]215            lastEx = ex;
216          }
217        }
218      }
219      // all number formats threw an exception -> rethrow the last exception
220      throw lastEx;
221    }
222
[2]223    #region tokenizer
    // Kinds of lexical tokens produced by the Tokenizer.
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int, WhiteSpace
    }
227
    // A single lexical unit of the input file. Fields are public because the
    // tokenizer fills intValue/doubleValue directly via TryParse out-arguments.
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;   // raw text of the token
      public double doubleValue;   // numeric value when the text parsed as a double
      public int intValue;         // numeric value when the text parsed as an int

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }
245
246
    // Splits the input stream into Token instances, one line at a time.
    // The '@', '=' and newline tokens are static singletons; the parser relies
    // on reference comparison against them (e.g. current == Tokenizer.NewlineToken).
    class Tokenizer {
      private StreamReader reader;
      private List<Token> tokens;      // look-ahead buffer holding the tokens of the current line
      private string[] separators = new string[] { "@", "=", ";", "\t" };
      private NumberFormatInfo numberFormatInfo;   // number format used when classifying tokens

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
      public static Token SeparatorToken = new Token(TokenTypeEnum.WhiteSpace, "");
      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }


      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        tokens = new List<Token>();
        ReadNextTokens();
      }

      // Reads the next input line, tokenizes it and appends the tokens plus a
      // trailing NewlineToken to the buffer.
      private void ReadNextTokens() {
        if (!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.None), delegate(string str) {
            return MakeToken(str.Trim());
          });

          // NOTE(review): MakeToken never returns the SeparatorToken singleton, so this
          // reference comparison filters nothing; empty fields arrive as WhiteSpace tokens.
          foreach (Token tok in newTokens) {
            if (tok != SeparatorToken) tokens.Add(tok);
          }
          tokens.Add(NewlineToken);
          CurrentLineNumber++;
        }
      }

      // Classifies a trimmed field as Int, Double, WhiteSpace (empty) or String.
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);

        // try to parse as a number first
        if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
          return token;
        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
          return token;
        } else if (String.IsNullOrEmpty(strToken)) {
          token.type = TokenTypeEnum.WhiteSpace;
          return token;
        }
        // couldn't parse the token as an int or float number so return a string token
        return token;
      }

      // Returns the next token without consuming it. Callers must check HasNext() first.
      public Token Peek() {
        return tokens[0];
      }

      // Consumes and returns the next token; refills the buffer from the next
      // line when it runs empty.
      public Token Next() {
        Token next = tokens[0];
        tokens.RemoveAt(0);
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
323    #endregion
324
325    #region parsing
326    private void Parse(bool strict) {
327      ParseMetaData(strict);
[1221]328      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]329      ParseSampleData(strict);
[1221]330      if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]331    }
332
333    private void ParseSampleData(bool strict) {
334      List<double> row = new List<double>();
[1221]335      while (tokenizer.HasNext()) {
[2]336        Token current = tokenizer.Next();
[1221]337        if (current.type == TokenTypeEnum.WhiteSpace) {
338          row.Add(double.NaN);
339        } else if (current.type == TokenTypeEnum.Double) {
[2]340          // just take the value
341          row.Add(current.doubleValue);
[1221]342        } else if (current.type == TokenTypeEnum.Int) {
[2]343          // translate the int value to double
344          row.Add((double)current.intValue);
[1221]345        } else if (current == Tokenizer.NewlineToken) {
[2]346          // when parsing strictly all rows have to have the same number of values           
[1221]347          if (strict) {
[2]348            // the first row defines how many samples are needed
[1221]349            if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {
[2]350              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
[273]351                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
[2]352            }
[1221]353          } else if (samplesList.Count > 0) {
[2]354            // when we are not strict then fill or drop elements as needed
[1221]355            if (samplesList[0].Count > row.Count) {
[2]356              // fill with NAN
[1221]357              for (int i = row.Count; i < samplesList[0].Count; i++) {
[2]358                row.Add(double.NaN);
359              }
[1221]360            } else if (samplesList[0].Count < row.Count) {
[2]361              // drop last k elements where k = n - length of first row
362              row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
363            }
364          }
365
366          // add the current row to the collection of rows and start a new row
367          samplesList.Add(row);
368          row = new List<double>();
369        } else {
370          // found an unexpected token => return false when parsing strictly
371          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
[1221]372          if (strict) {
[273]373            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
[2]374          } else {
375            row.Add(double.NaN);
376          }
377        }
378      }
379    }
380
381    private void ParseMetaData(bool strict) {
[1221]382      while (tokenizer.HasNext() && (tokenizer.Peek().type == TokenTypeEnum.WhiteSpace || tokenizer.Peek().type == TokenTypeEnum.String)) {
383        while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next();
[2]384        Token nameToken = tokenizer.Next();
[1221]385        if (nameToken.type != TokenTypeEnum.String)
[273]386          Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
[2]387
388        List<Token> tokens = new List<Token>();
[1221]389        Token valueToken;
390        while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) valueToken = tokenizer.Next();
391        valueToken = tokenizer.Next();
392        while (valueToken != Tokenizer.NewlineToken) {
[2]393          tokens.Add(valueToken);
[1221]394          while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next();
[2]395          valueToken = tokenizer.Next();
396        }
397
398        metadata[nameToken.stringValue] = tokens;
399      }
400    }
401
402    private void Expect(Token expectedToken) {
403      Token actualToken = tokenizer.Next();
[1221]404      if (actualToken != expectedToken) {
[273]405        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]406      }
407    }
408
[273]409    private void Error(string message, string token, int lineNumber) {
410      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]411    }
412    #endregion
413  }
414}
Note: See TracBrowser for help on using the repository browser.