Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 6238

Last change on this file since 6238 was 5809, checked in by mkommend, 14 years ago

#1418: Reintegrated branch into trunk.

File size: 14.1 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
[5445]3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
[5484]27using System.Runtime.Serialization;
[2446]28using System.Text;
[2]29
[3373]30namespace HeuristicLab.Problems.DataAnalysis {
[5013]31  public class TableFileParser {
// number of characters sampled from the file to guess its format
private const int BUFFER_SIZE = 1024;
// candidate column separators; the table is never modified, so a single shared
// (static) instance is sufficient instead of one array per parser instance
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
// tokenizer of the file that is currently being parsed
private Tokenizer tokenizer;
// parsed data rows, one inner list per line of the file
private List<List<double>> rowValues;
[2]36
// backing field; Parse(fileName) assigns the number of parsed data rows
private int rows;
// Number of rows of the parsed table (can also be overwritten by callers).
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}
42
// backing field; Parse(fileName) assigns the number of values in the first data row
private int columns;
// Number of columns of the parsed table (can also be overwritten by callers).
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}
48
// backing field; filled by Parse(fileName) with dimensions [rows, columns]
private double[,] values;
// The parsed table; stays null until Parse(fileName) has completed.
public double[,] Values {
  get { return values; }
}
55
// names read from the header line; stays empty when the file has no header
private List<string> variableNames;
// Returns the names from the header line. If there are none, default names
// "X000", "X001", ... are generated, one for each column.
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count == 0) {
      string[] generatedNames = new string[columns];
      int index = 0;
      while (index < generatedNames.Length) {
        generatedNames[index] = "X" + index.ToString("000");
        index++;
      }
      return generatedNames;
    }
    return variableNames;
  }
}
69
// Creates a parser with an empty header and no data rows.
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<double>>();
}
74
// Reads the table stored in the given file. The number format and the column
// separator are detected automatically. On success the result is available
// through Rows, Columns, Values and VariableNames.
// An exception is raised (see Error) if the file contains no data row or if a
// row has a different number of columns than the first data row.
public void Parse(string fileName) {
  // discard the results of a previous run; otherwise the rows (and possibly a stale
  // header) of an earlier file leak into the result when the parser is reused
  rowValues = new List<List<double>>();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, separator);
    // parse the file
    Parse();
  }

  // copy the list of rows into a rectangular array;
  // Parse() raises an error if no row was read, so rowValues[0] exists here
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];

  for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
    List<double> row = rowValues[rowIndex];
    // ParseValues() guarantees that each row has exactly 'columns' elements
    for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
      values[rowIndex, columnIndex] = row[columnIndex];
    }
  }
}
100
// Guesses the number format and the column separator of the file from the
// character frequencies in a block of at most BUFFER_SIZE characters that
// follows the first line.
//   fileName:     path of the file to inspect
//   numberFormat: receives the invariant (English) or the German number format
//   separator:    receives the detected column separator; ' ' if no other candidate qualifies
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
    //   => check the line in more detail:
    //            English: 0, 0, 0, 0
    //            German:  0,0 0,0 0,0 ...
    //            => if commas are followed by space => English format
    // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > 10) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > 10) {
      // no points and many commas
      int countCommaNonDigitPairs = 0;
      for (int i = 0; i < charsRead - 1; i++) {
        if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
          countCommaNonDigitPairs++;
        }
      }
      if (countCommaNonDigitPairs > 10) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values)
        // useUserOverride = false: the detection must not depend on number settings that
        // the current user customized for de-DE in the operating system
        numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE", false));
        // ',' is the decimal separator here and therefore cannot separate columns
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Where(c => c != ','));
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

// Returns the candidate that occurs most often (and more than 10 times) according to
// charCounts. Ties are resolved in favor of the candidate listed first.
// Returns ' ' if no candidate occurs more than 10 times.
private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
164
// Returns how often character c was counted in charCounts,
// or 0 if charCounts has no entry for c.
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  // single lookup instead of ContainsKey followed by the indexer
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
168
[2]169    #region tokenizer
// Kinds of tokens produced by the tokenizer.
internal enum TokenTypeEnum {
  // end of a line
  NewLine,
  // column separator
  Separator,
  // cell content that could not be parsed as a number
  String,
  // cell content that was parsed as a number
  Double
}
173
// A single token of the input: its kind, its text and (for numbers) its value.
// The members are plain fields because doubleValue is filled through an
// 'out' argument of double.TryParse.
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue = 0.0;

  // Creates a token of the given kind with the given text; doubleValue starts at 0.
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
  }

  // The textual representation of a token is its text.
  public override string ToString() {
    return this.stringValue;
  }
}
189
190
// Splits the lines of a text reader into tokens (cell values, separators and
// new-lines). Only the tokens of one line are kept in memory; the next line is
// read as soon as the last token of the current line has been consumed.
internal class Tokenizer {
  private StreamReader reader;
  // tokens of the current line
  private List<Token> tokens;
  // index of the next unread token in 'tokens'; a read cursor avoids the O(n)
  // shifting that removing the first list element for each token would cause
  private int position;
  private NumberFormatInfo numberFormatInfo;
  private char separator;
  // text of the shared separator token (used e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  // 1-based number of the line whose tokens are currently buffered (0 if nothing was read)
  private int currentLineNumber = 0;
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  // text of the line that was read last
  private string currentLine;
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  // the single token instance that marks the end of every line (compare by reference)
  private Token newlineToken;
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  // the single token instance that stands for every column separator (compare by reference)
  private Token separatorToken;
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  // Creates a tokenizer on top of the reader and immediately buffers the tokens of the first line.
  //   numberFormatInfo: number format used to recognize numeric cells
  //   separator:        character that separates the columns
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new List<Token>();
    position = 0;
    ReadNextTokens();
  }

  // Replaces the buffered tokens by the tokens of the next line.
  // The buffer stays empty if the end of the stream has been reached.
  private void ReadNextTokens() {
    tokens.Clear();
    position = 0;
    if (!reader.EndOfStream) {
      CurrentLine = reader.ReadLine();
      StringBuilder cell = new StringBuilder();
      foreach (char c in CurrentLine) {
        if (c == separator) {
          AddCellToken(cell.ToString());
          cell.Length = 0;
          // the separator token is added directly; cell text never has to be compared
          // with INTERNAL_SEPARATOR, so a cell containing just "#" remains a value
          tokens.Add(SeparatorToken);
        } else {
          cell.Append(c);
        }
      }
      AddCellToken(cell.ToString());
      tokens.Add(NewlineToken);
      CurrentLineNumber++;
    }
  }

  // Adds a token for the trimmed cell text; empty cells produce no token.
  private void AddCellToken(string cellText) {
    string trimmedStr = cellText.Trim();
    if (!string.IsNullOrEmpty(trimmedStr)) {
      tokens.Add(MakeToken(trimmedStr));
    }
  }

  // Creates a Double token if the text is a number in the configured format,
  // otherwise a String token.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    return token;
  }

  // Returns the next token without consuming it.
  // Throws ArgumentOutOfRangeException if no token is buffered.
  public Token Peek() {
    return tokens[position];
  }

  // Consumes and returns the next token. After the last token of a line
  // the following line is read.
  // Throws ArgumentOutOfRangeException if no token is buffered.
  public Token Next() {
    Token next = tokens[position];
    position++;
    if (position == tokens.Count) {
      ReadNextTokens();
    }
    return next;
  }

  // True while tokens are buffered or the reader has more input.
  public bool HasNext() {
    return position < tokens.Count || !reader.EndOfStream;
  }
}
289    #endregion
290
291    #region parsing
// Parses the optional header line followed by all data rows of the current
// tokenizer. Raises an error if no data row is available.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
298
// Reads all remaining lines as data rows and appends them to rowValues.
// Raises an error if a row has a different number of values than the first data row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // remember the line number before the row is consumed: reading the terminating
    // new-line token makes the tokenizer buffer the following line already, so
    // CurrentLineNumber would point past the row when the error is reported
    int lineNumber = tokenizer.CurrentLineNumber;

    List<double> row = rowValues.Count > 0
      ? new List<double>(rowValues[0].Count)
      : new List<double>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);
    // all rows have to have the same number of values
    // the first row defines how many samples are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
        "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
    }
    // add the current row to the collection of rows
    rowValues.Add(row);
  }
}
319
// Returns the value of the next cell. An empty cell (the next token is a
// separator or a new-line, which is not consumed) and a non-numeric cell
// both yield NaN.
private double NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  bool cellIsEmpty = upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken;
  if (cellIsEmpty) return double.NaN;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
    case TokenTypeEnum.String:
      return double.NaN;
    case TokenTypeEnum.Double:
      // just take the value
      return current.doubleValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
334
// Reads the header line, if there is one, and stores its cells as variable names.
// The first line is regarded as header if it does not start with a number.
private void ParseVariableNames() {
  // nothing to read, or the first line already starts with a number => no header
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

  Token lastToken = tokenizer.Next();
  List<Token> headerTokens = new List<Token> { lastToken };
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    lastToken = tokenizer.Next();
    // a separator at the end of the line is directly followed by the new-line token
    if (lastToken != tokenizer.NewlineToken) {
      headerTokens.Add(lastToken);
    }
  }
  // consume the end of the line unless that already happened inside the loop
  if (lastToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = headerTokens.Select(t => t.stringValue.Trim()).ToList();
}
357
// Consumes the next token and raises an error if it is not the expected token instance.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) return;
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
364
// Always throws a DataFormatException that carries the message, the offending
// token and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
368    #endregion
[5484]369
// Exception raised for malformed input. In addition to the message it exposes
// the offending token and the line number.
[Serializable]
private class DataFormatException : Exception {
  // names under which the additional fields are stored in the serialization data
  private const string LineEntry = "DataFormatException.Line";
  private const string TokenEntry = "DataFormatException.Token";

  private int line;
  public int Line {
    get { return line; }
  }
  private string token;
  public string Token {
    get { return token; }
  }

  // Creates the exception; token and line are also appended to the message text.
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.token = token;
    this.line = line;
  }

  // Deserialization constructor: restores the additional fields as well, which
  // would otherwise silently fall back to null / 0 after a round trip.
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
    token = info.GetString(TokenEntry);
    line = info.GetInt32(LineEntry);
  }

  // Stores the additional fields next to the data written by the base class.
  public override void GetObjectData(SerializationInfo info, StreamingContext context) {
    base.GetObjectData(info, context);
    info.AddValue(TokenEntry, token);
    info.AddValue(LineEntry, line);
  }
}
[2]388  }
389}
Note: See TracBrowser for help on using the repository browser.