Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 5046

Last change on this file since 5046 was 5013, checked in by gkronber, 14 years ago

Implemented heuristic to determine format for import of data tables and test cases. #1173

File size: 13.6 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
[3264]3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
27using System.Text;
[2]28
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Reads a text file containing a table of numbers into a two-dimensional array of doubles.
  /// </summary>
  /// <remarks>
  /// The number format (invariant/English or German) and the column separator (',', ';', tab or space)
  /// are guessed from a block of characters following the first line of the file.
  /// If the first line does not start with a number it is interpreted as a list of variable names.
  /// Cells that are empty or cannot be parsed as a number are stored as <see cref="double.NaN"/>.
  /// </remarks>
  public class TableFileParser {
    // number of characters that are sampled to guess the file format
    private const int BUFFER_SIZE = 1024;
    // a character has to occur more often than this in the sampled block to be considered "frequent"
    private const int FREQUENT_CHAR_THRESHOLD = 10;
    private const string NO_VALUES_MESSAGE = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";
    // candidates in order of preference; ' ' is only used when none of these occurs frequently
    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };

    private Tokenizer tokenizer;
    private List<string> variableNames;
    private List<List<double>> rowValues;

    private int rows;
    /// <summary>Number of data rows of the most recently parsed file.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of columns of the most recently parsed file.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[,] values;
    /// <summary>The parsed values indexed as [row, column]; null until a file has been parsed.</summary>
    public double[,] Values {
      get { return values; }
    }

    /// <summary>
    /// The variable names read from the first line of the file. If the file did not
    /// contain any names, default names "X000", "X001", ... are generated, one per column.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;

        string[] names = new string[columns];
        for (int i = 0; i < names.Length; i++) {
          names[i] = "X" + i.ToString("000");
        }
        return names;
      }
    }

    /// <summary>Creates a parser that does not hold any data yet.</summary>
    public TableFileParser() {
      rowValues = new List<List<double>>();
      variableNames = new List<string>();
    }

    /// <summary>
    /// Discards the intermediate results of a previous parse.
    /// New lists are allocated (instead of clearing the old ones) so that a name list
    /// that was handed out through <see cref="VariableNames"/> is not modified.
    /// </summary>
    private void Reset() {
      variableNames = new List<string>();
      rowValues = new List<List<double>>();
    }

    /// <summary>
    /// Parses the given file and makes the result available through <see cref="Values"/>,
    /// <see cref="Rows"/>, <see cref="Columns"/> and <see cref="VariableNames"/>.
    /// </summary>
    /// <param name="fileName">Path of the file to read.</param>
    /// <exception cref="DataFormatException">
    /// Thrown when the file contains no data rows, when rows have differing numbers of columns
    /// or when an unexpected token is encountered.
    /// </exception>
    public void Parse(string fileName) {
      // start from a clean state, otherwise the rows of a previously parsed file would be kept
      Reset();

      NumberFormatInfo numberFormat;
      char separator;
      DetermineFileFormat(fileName, out numberFormat, out separator);
      using (StreamReader reader = new StreamReader(fileName)) {
        tokenizer = new Tokenizer(reader, numberFormat, separator);
        // parse the file
        Parse();
      }

      // copy the list of rows into a rectangular array
      // (Parse() guarantees at least one row and the same number of columns in every row)
      rows = rowValues.Count;
      columns = rowValues[0].Count;
      values = new double[rows, columns];
      for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
        List<double> row = rowValues[rowIndex];
        for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
          values[rowIndex, columnIndex] = row[columnIndex];
        }
      }
    }

    /// <summary>
    /// Guesses the number format and the column separator from the frequencies of characters
    /// in a block of up to <see cref="BUFFER_SIZE"/> characters following the first line.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="numberFormat">Receives the number format that should be used for parsing values.</param>
    /// <param name="separator">Receives the character that separates columns.</param>
    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
      using (StreamReader reader = new StreamReader(fileName)) {
        // skip first line (it is treated as variable names if it does not start with a number)
        reader.ReadLine();
        // read a block
        char[] buffer = new char[BUFFER_SIZE];
        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
        // count frequency of special characters
        Dictionary<char, int> charCounts = buffer.Take(charsRead)
          .GroupBy(c => c)
          .ToDictionary(g => g.Key, g => g.Count());

        // depending on the characters occurring in the block
        // we distinguish a number of different cases based on the following rules:
        // many points => it must be English number format, the other frequently occurring char is the separator
        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
        //   => check the line in more detail:
        //            English: 0, 0, 0, 0
        //            German:  0,0 0,0 0,0 ...
        //            => if commas are followed by a non-digit (e.g. space) => English format
        // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
        if (OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD) {
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
        } else if (OccurrencesOf(charCounts, ',') > FREQUENT_CHAR_THRESHOLD) {
          // no points and many commas
          int countCommaNonDigitPairs = 0;
          for (int i = 0; i < charsRead - 1; i++) {
            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
              countCommaNonDigitPairs++;
            }
          }
          if (countCommaNonDigitPairs > FREQUENT_CHAR_THRESHOLD) {
            // English format (only integer values) with ',' as separator
            numberFormat = NumberFormatInfo.InvariantInfo;
            separator = ',';
          } else {
            // German format (real values), the comma is the decimal mark and cannot be the separator
            char[] disallowedSeparators = new char[] { ',' };
            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de"));
            separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(disallowedSeparators));
          }
        } else {
          // no points and no commas => English format
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
        }
      }
    }

    /// <summary>
    /// Returns the candidate that occurs most often, considering only candidates that occur
    /// more than <see cref="FREQUENT_CHAR_THRESHOLD"/> times. Ties are resolved in favor of the
    /// candidate listed first. Falls back to ' ' when no candidate is frequent enough.
    /// </summary>
    private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
      return candidates
        .Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
        .OrderByDescending(c => OccurrencesOf(charCounts, c))
        .DefaultIfEmpty(' ')
        .First();
    }

    /// <summary>Returns how often <paramref name="c"/> was counted, or 0 if it was not counted at all.</summary>
    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
      int count;
      return charCounts.TryGetValue(c, out count) ? count : 0;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      NewLine, Separator, String, Double
    }

    /// <summary>A lexical element of the file: a cell (string or double), a separator or a line end.</summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      // only meaningful for tokens of type Double
      public double doubleValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

    /// <summary>
    /// Splits the lines of a reader into tokens, one line at a time.
    /// Every line contributes its non-empty cells, one <see cref="SeparatorToken"/> per separator
    /// character and a final <see cref="NewlineToken"/>. Separator and newline tokens are shared
    /// instances so that they can be recognized by reference comparison.
    /// </summary>
    internal class Tokenizer {
      private StreamReader reader;
      // a queue is used because tokens are only ever consumed from the front
      private Queue<Token> tokens;
      private NumberFormatInfo numberFormatInfo;
      private char separator;
      // text representation of the separator token (shown in error messages)
      private const string INTERNAL_SEPARATOR = "#";

      /// <summary>Number of lines read so far, i.e. the 1-based number of the line whose tokens are currently pending.</summary>
      public int CurrentLineNumber { get; private set; }

      /// <summary>Text of the line that was read most recently.</summary>
      public string CurrentLine { get; private set; }

      public Token NewlineToken { get; private set; }

      public Token SeparatorToken { get; private set; }

      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        this.separator = separator;
        SeparatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
        NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      /// <summary>Reads the next line (if there is one) and appends its tokens to the queue.</summary>
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        StringBuilder cell = new StringBuilder();
        foreach (char c in CurrentLine) {
          if (c == separator) {
            EnqueueCell(cell.ToString());
            cell.Length = 0;
            // the separator token is enqueued directly so that a cell containing
            // the text of INTERNAL_SEPARATOR can never be mistaken for a separator
            tokens.Enqueue(SeparatorToken);
          } else {
            cell.Append(c);
          }
        }
        EnqueueCell(cell.ToString());
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      /// <summary>Appends a token for the given cell text; cells that are empty after trimming produce no token.</summary>
      private void EnqueueCell(string text) {
        string trimmed = text.Trim();
        if (!string.IsNullOrEmpty(trimmed)) {
          tokens.Enqueue(MakeToken(trimmed));
        }
      }

      /// <summary>Creates a Double token if the text can be parsed as a number and a String token otherwise.</summary>
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);
        if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        // if the text is not a number the token stays a string token
        return token;
      }

      /// <summary>Returns the next token without consuming it. Must only be called when <see cref="HasNext"/> is true.</summary>
      public Token Peek() {
        return tokens.Peek();
      }

      /// <summary>
      /// Consumes and returns the next token. When the last token of a line has been consumed
      /// the following line is read immediately, which also advances <see cref="CurrentLineNumber"/>.
      /// </summary>
      public Token Next() {
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    /// <summary>Parses the optional line of variable names followed by all rows of values.</summary>
    private void Parse() {
      ParseVariableNames();
      if (!tokenizer.HasNext()) Error(NO_VALUES_MESSAGE, "", tokenizer.CurrentLineNumber);
      ParseValues();
      if (rowValues.Count == 0) Error(NO_VALUES_MESSAGE, "", tokenizer.CurrentLineNumber);
    }

    /// <summary>Reads all remaining lines as rows of values and checks that all rows have the same length.</summary>
    private void ParseValues() {
      while (tokenizer.HasNext()) {
        // remember the line number now; consuming the newline token below lets the
        // tokenizer read ahead, after which CurrentLineNumber refers to the following line
        int rowLineNumber = tokenizer.CurrentLineNumber;

        List<double> row = new List<double>();
        row.Add(NextValue(tokenizer));
        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
          Expect(tokenizer.SeparatorToken);
          row.Add(NextValue(tokenizer));
        }
        Expect(tokenizer.NewlineToken);
        // all rows have to have the same number of values
        // the first row defines how many samples are needed
        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "", rowLineNumber);
        }
        rowValues.Add(row);
      }
    }

    /// <summary>
    /// Returns the value of the next cell. An empty cell (the next token is a separator or
    /// a newline, which is not consumed) and a cell that is not a number both yield NaN.
    /// </summary>
    private double NextValue(Tokenizer tokenizer) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

      int lineNumber = tokenizer.CurrentLineNumber;
      Token current = tokenizer.Next();
      switch (current.type) {
        case TokenTypeEnum.Double:
          // just take the value
          return current.doubleValue;
        case TokenTypeEnum.Separator:
        case TokenTypeEnum.String:
          return double.NaN;
        default:
          // found an unexpected token => throw error
          Error("Unexpected token.", current.stringValue, lineNumber);
          // this line is never executed because Error() throws an exception
          throw new InvalidOperationException();
      }
    }

    /// <summary>
    /// Reads the first line as variable names if it does not start with a number.
    /// A blank first line is consumed without producing any names.
    /// </summary>
    private void ParseVariableNames() {
      // if the first line doesn't start with a double value then we assume that the
      // first line contains variable names
      if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

      List<Token> nameTokens = new List<Token>();
      Token valueToken = tokenizer.Next();
      if (valueToken != tokenizer.NewlineToken) {
        nameTokens.Add(valueToken);
      }
      // stop as soon as the newline of the header has been consumed, the tokens
      // that are pending afterwards already belong to the first row of values
      while (valueToken != tokenizer.NewlineToken
             && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        valueToken = tokenizer.Next();
        if (valueToken != tokenizer.NewlineToken) {
          nameTokens.Add(valueToken);
        }
      }
      if (valueToken != tokenizer.NewlineToken) {
        Expect(tokenizer.NewlineToken);
      }
      variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
    }

    /// <summary>Consumes the next token and raises a parse error if it is not the expected one.</summary>
    private void Expect(Token expectedToken) {
      // capture the line number first, Next() may already read the following line
      int lineNumber = tokenizer.CurrentLineNumber;
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, lineNumber);
      }
    }

    /// <summary>Throws a <see cref="DataFormatException"/> describing a problem at the given token and line.</summary>
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.