Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 6406

Last change on this file since 6406 was 5445, checked in by swagner, 14 years ago

Updated year of copyrights (#1406)

File size: 13.5 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
[5445]3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
27using System.Text;
[2]28
[3373]29namespace HeuristicLab.Problems.DataAnalysis {
[5013]30  public class TableFileParser {
// Number of characters sampled from the file when guessing its format.
private const int BUFFER_SIZE = 1024;
// Characters considered as column separators; a blank is used when none of them is frequent.
private readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };
// Tokenizer of the file that is currently being parsed.
private Tokenizer tokenizer;
// Parsed values, one inner list per data row.
private List<List<double>> rowValues;

private int rows;
/// <summary>Number of data rows; assigned by Parse(string).</summary>
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}

private int columns;
/// <summary>Number of columns; assigned by Parse(string) from the first data row.</summary>
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}

private double[,] values;
/// <summary>Parsed values indexed as [row, column]; assigned by Parse(string).</summary>
public double[,] Values {
  get { return values; }
}
54
private List<string> variableNames;
/// <summary>
/// Names of the columns. When no names were read from the file, generic names
/// "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count == 0) {
      // no header was parsed => synthesize a name for every column
      string[] generated = new string[columns];
      for (int position = 0; position < generated.Length; position++) {
        generated[position] = string.Format("X{0:000}", position);
      }
      return generated;
    }
    return variableNames;
  }
}
68
/// <summary>Creates a parser with no rows and no variable names.</summary>
public TableFileParser() {
  variableNames = new List<string>();
  rowValues = new List<List<double>>();
}
73
/// <summary>
/// Parses the table stored in the given text file. Afterwards the result is available
/// through Rows, Columns, Values and VariableNames.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
/// <remarks>
/// Number format and column separator are detected by DetermineFileFormat.
/// Malformed content is reported by the parsing methods through Error().
/// </remarks>
public void Parse(string fileName) {
  // Start from a clean state: without this a second call on the same instance would
  // append the new rows to the rows of the previously parsed file and could keep the
  // old header names. Fresh lists are assigned (instead of clearing) so that a list
  // handed out earlier through VariableNames is not modified behind the caller's back.
  rowValues = new List<List<double>>();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, separator);
    // parse the file
    Parse();
  }

  // copy the list of rows into the two-dimensional result array;
  // Parse() has raised an error if no row was read and ParseValues() enforces
  // that every row has as many entries as the first one
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];
  for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
    List<double> row = rowValues[rowIndex];
    for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
      values[rowIndex, columnIndex] = row[columnIndex];
    }
  }
}
99
// A character has to occur more often than this in the sampled block to be considered
// significant (as decimal point, decimal comma or column separator).
private const int FREQUENT_CHAR_THRESHOLD = 10;

/// <summary>
/// Guesses number format and column separator of a file from the characters that
/// occur in a block of at most BUFFER_SIZE characters following the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the invariant (English) or the German number format.</param>
/// <param name="separator">Receives the detected column separator; ' ' if no other candidate is frequent.</param>
/// <remarks>
/// The detection relies on character frequencies above FREQUENT_CHAR_THRESHOLD, so for
/// files whose sampled block contains fewer occurrences the result falls back to the
/// invariant number format and/or the blank as separator.
/// </remarks>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
  char[] buffer = new char[BUFFER_SIZE];
  int charsRead;
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line (it may contain variable names instead of values)
    reader.ReadLine();
    // read a block
    charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
  }

  // count frequency of each character in the block
  Dictionary<char, int> charCounts = buffer.Take(charsRead)
    .GroupBy(c => c)
    .ToDictionary(g => g.Key, g => g.Count());

  // depending on the characters occurring in the block
  // we distinguish a number of different cases based on the following rules:
  // many points => it must be English number format, the other frequently occurring char is the separator
  // no points but many commas => this is the problematic case. Either German format (real numbers)
  //   or English format (only integer numbers) with ',' as separator
  //   => check the block in more detail:
  //            English: 0, 0, 0, 0
  //            German:  0,0 0,0 0,0 ...
  //            => if commas are followed by a non-digit (e.g. a space) => English format
  // no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
  // in all cases only treat ' ' as separator if no other separator is possible
  // (spaces can also occur additionally to separators)
  if (OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD) {
    numberFormat = NumberFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(charCounts, new char[0]);
  } else if (OccurrencesOf(charCounts, ',') > FREQUENT_CHAR_THRESHOLD) {
    // no points and many commas
    int countCommaNonDigitPairs = 0;
    for (int i = 0; i < charsRead - 1; i++) {
      if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
        countCommaNonDigitPairs++;
      }
    }
    if (countCommaNonDigitPairs > FREQUENT_CHAR_THRESHOLD) {
      // English format (only integer values) with ',' as separator
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = ',';
    } else {
      // German format (real values); useUserOverride is false so that the decimal symbol
      // does not depend on the regional settings customized by the current user
      numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE", false));
      // the comma is the decimal symbol here and therefore cannot be the separator
      separator = MostFrequentSeparator(charCounts, new char[] { ',' });
    }
  } else {
    // no points and no commas => English format
    numberFormat = NumberFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(charCounts, new char[0]);
  }
}

// Returns the candidate separator (excluding the given characters) that occurs most often
// and more than FREQUENT_CHAR_THRESHOLD times; returns ' ' if there is no such candidate.
private char MostFrequentSeparator(Dictionary<char, int> charCounts, char[] disallowedSeparators) {
  return POSSIBLE_SEPARATORS
    .Except(disallowedSeparators)
    .Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
163
/// <summary>Returns how often <paramref name="c"/> was counted, or 0 if it has no entry.</summary>
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int occurrences;
  // TryGetValue leaves the default value 0 in 'occurrences' when the key is missing
  charCounts.TryGetValue(c, out occurrences);
  return occurrences;
}
167
[2]168    #region tokenizer
/// <summary>Kinds of tokens produced by the tokenizer.</summary>
internal enum TokenTypeEnum {
  // end of a line
  NewLine,
  // column separator
  Separator,
  // text that could not be parsed as a number
  String,
  // text that was parsed as a floating point number
  Double
}
172
/// <summary>
/// A single token: its kind, its text and - for numeric tokens - the parsed value.
/// The members are plain fields because doubleValue is filled through an out argument.
/// </summary>
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;

  /// <summary>Creates a token of the given kind with the given text and a numeric value of 0.</summary>
  public Token(TokenTypeEnum type, string value) {
    doubleValue = 0.0;
    stringValue = value;
    this.type = type;
  }

  /// <summary>Returns the text of the token.</summary>
  public override string ToString() {
    return stringValue;
  }
}
188
189
/// <summary>
/// Splits the lines of a text stream into tokens: numbers, strings, column separators
/// and one newline token at the end of every line.
/// </summary>
/// <remarks>
/// Lines are read on demand, i.e. only when a token is requested and none is buffered.
/// CurrentLine and CurrentLineNumber therefore always describe the line of the most
/// recently delivered token and not a line that was read ahead.
/// </remarks>
internal class Tokenizer {
  // text of the separator token; only used for display (e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private readonly StreamReader reader;
  // tokens of the current line that have not been consumed yet
  private readonly Queue<Token> tokens;
  private readonly NumberFormatInfo numberFormatInfo;
  private readonly char separator;

  private int currentLineNumber = 0;
  /// <summary>Number of lines read from the stream so far (1-based number of the current line).</summary>
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  /// <summary>Text of the line that was read last; null before the first line is read.</summary>
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  /// <summary>The single token instance that marks the end of a line.</summary>
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  /// <summary>The single token instance that marks a column separator.</summary>
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>Creates a tokenizer for the given stream.</summary>
  /// <param name="reader">Stream to read from; it is not disposed by the tokenizer.</param>
  /// <param name="numberFormatInfo">Number format used to recognize numeric tokens.</param>
  /// <param name="separator">Character that separates the columns of a line.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
  }

  // Reads the next line (if there is one) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    // Separator tokens are emitted directly while scanning the line. A field whose text
    // happens to be "#" is therefore an ordinary string token and cannot be confused
    // with a column separator.
    StringBuilder field = new StringBuilder();
    foreach (char c in CurrentLine) {
      if (c == separator) {
        EnqueueField(field.ToString());
        field.Length = 0;
        tokens.Enqueue(SeparatorToken);
      } else {
        field.Append(c);
      }
    }
    EnqueueField(field.ToString());
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Appends a token for the given field text; fields that are empty after trimming yield no token.
  private void EnqueueField(string rawField) {
    string trimmedField = rawField.Trim();
    if (trimmedField.Length > 0) {
      tokens.Enqueue(MakeToken(trimmedField));
    }
  }

  // Creates a Double token if the text is a number in the configured format, a String token otherwise.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    // if the text couldn't be parsed as a number the token stays a string token
    return token;
  }

  // Makes sure that the tokens of the next line are buffered when the current line is used up.
  private void EnsureTokens() {
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
  }

  /// <summary>Returns the next token without consuming it.</summary>
  /// <exception cref="InvalidOperationException">There is no further token (HasNext() is false).</exception>
  public Token Peek() {
    EnsureTokens();
    return tokens.Peek();
  }

  /// <summary>Consumes and returns the next token.</summary>
  /// <exception cref="InvalidOperationException">There is no further token (HasNext() is false).</exception>
  public Token Next() {
    EnsureTokens();
    return tokens.Dequeue();
  }

  /// <summary>Returns true if a token is buffered or the stream has further lines.</summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
288    #endregion
289
290    #region parsing
// Parses an optional header line followed by the data rows of the current tokenizer.
// Raises an error (via Error) if there is nothing left after the header or if no row was read.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
297
// Reads all remaining lines; each line becomes one row of values in rowValues.
// Every row must contain as many values as the first row, otherwise an error is raised.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // a line consists of a value followed by any number of (separator, value) pairs
    List<double> currentRow = new List<double>();
    currentRow.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      currentRow.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // the first row defines how many values each following row needs
    bool isFirstRow = rowValues.Count == 0;
    if (!isFirstRow && rowValues[0].Count != currentRow.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
        "\nLine " + tokenizer.CurrentLineNumber + " has " + currentRow.Count + " columns.", "", tokenizer.CurrentLineNumber);
    }

    rowValues.Add(currentRow);
  }
}
318
// Returns the value of the next cell. A missing cell (the next token is the separator or
// the newline token, which is left unconsumed) and a non-numeric cell both yield NaN.
private double NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) {
    return double.NaN;
  }

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
    case TokenTypeEnum.String:
      return double.NaN;
    case TokenTypeEnum.Double:
      // just take the value
      return current.doubleValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
333
// Reads the header line into variableNames. The first line is taken as header
// unless it starts with a numeric token; in that case nothing is consumed.
private void ParseVariableNames() {
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) {
    return;
  }

  List<Token> headerTokens = new List<Token>();
  Token lastToken = tokenizer.Next();
  headerTokens.Add(lastToken);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    lastToken = tokenizer.Next();
    // a separator directly before the end of the line is followed by the newline token
    if (lastToken != tokenizer.NewlineToken) {
      headerTokens.Add(lastToken);
    }
  }
  // consume the end of the header line unless that has already happened above
  if (lastToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }

  List<string> names = new List<string>(headerTokens.Count);
  foreach (Token headerToken in headerTokens) {
    names.Add(headerToken.stringValue.Trim());
  }
  variableNames = names;
}
356
// Consumes the next token and raises an error if it is not the given token instance.
private void Expect(Token expectedToken) {
  Token consumed = tokenizer.Next();
  if (consumed == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, consumed.stringValue, tokenizer.CurrentLineNumber);
}
363
// Reports a parsing problem by throwing a DataFormatException; this method never returns normally.
private void Error(string message, string token, int lineNumber) {
  string description = "Error while parsing.\n" + message;
  throw new DataFormatException(description, token, lineNumber);
}
367    #endregion
368  }
369}
Note: See TracBrowser for help on using the repository browser.