Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 5809

Last change on this file since 5809 was 5809, checked in by mkommend, 13 years ago

#1418: Reintegrated branch into trunk.

File size: 14.1 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using System.Runtime.Serialization;
28using System.Text;
29
30namespace HeuristicLab.Problems.DataAnalysis {
31  public class TableFileParser {
32    private const int BUFFER_SIZE = 1024;
33    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
34    private Tokenizer tokenizer;
35    private List<List<double>> rowValues;
36
37    private int rows;
38    public int Rows {
39      get { return rows; }
40      set { rows = value; }
41    }
42
43    private int columns;
44    public int Columns {
45      get { return columns; }
46      set { columns = value; }
47    }
48
49    private double[,] values;
50    public double[,] Values {
51      get {
52        return values;
53      }
54    }
55
56    private List<string> variableNames;
57    public IEnumerable<string> VariableNames {
58      get {
59        if (variableNames.Count > 0) return variableNames;
60        else {
61          string[] names = new string[columns];
62          for (int i = 0; i < names.Length; i++) {
63            names[i] = "X" + i.ToString("000");
64          }
65          return names;
66        }
67      }
68    }
69
70    public TableFileParser() {
71      rowValues = new List<List<double>>();
72      variableNames = new List<string>();
73    }
74
75    public void Parse(string fileName) {
76      NumberFormatInfo numberFormat;
77      char separator;
78      DetermineFileFormat(fileName, out numberFormat, out separator);
79      using (StreamReader reader = new StreamReader(fileName)) {
80        tokenizer = new Tokenizer(reader, numberFormat, separator);
81        // parse the file
82        Parse();
83      }
84
85      // translate the list of samples into a DoubleMatrixData item
86      rows = rowValues.Count;
87      columns = rowValues[0].Count;
88      values = new double[rows, columns];
89
90      int rowIndex = 0;
91      int columnIndex = 0;
92      foreach (List<double> row in rowValues) {
93        columnIndex = 0;
94        foreach (double element in row) {
95          values[rowIndex, columnIndex++] = element;
96        }
97        rowIndex++;
98      }
99    }
100
101    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
102      using (StreamReader reader = new StreamReader(fileName)) {
103        // skip first line
104        reader.ReadLine();
105        // read a block
106        char[] buffer = new char[BUFFER_SIZE];
107        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
108        // count frequency of special characters
109        Dictionary<char, int> charCounts = buffer.Take(charsRead)
110          .GroupBy(c => c)
111          .ToDictionary(g => g.Key, g => g.Count());
112
113        // depending on the characters occuring in the block
114        // we distinghish a number of different cases based on the the following rules:
115        // many points => it must be English number format, the other frequently occuring char is the separator
116        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
117        //   => check the line in more detail:
118        //            English: 0, 0, 0, 0
119        //            German:  0,0 0,0 0,0 ...
120        //            => if commas are followed by space => English format
121        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
122        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
123        if (OccurrencesOf(charCounts, '.') > 10) {
124          numberFormat = NumberFormatInfo.InvariantInfo;
125          separator = POSSIBLE_SEPARATORS
126            .Where(c => OccurrencesOf(charCounts, c) > 10)
127            .OrderBy(c => -OccurrencesOf(charCounts, c))
128            .DefaultIfEmpty(' ')
129            .First();
130        } else if (OccurrencesOf(charCounts, ',') > 10) {
131          // no points and many commas
132          int countCommaNonDigitPairs = 0;
133          for (int i = 0; i < charsRead - 1; i++) {
134            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
135              countCommaNonDigitPairs++;
136            }
137          }
138          if (countCommaNonDigitPairs > 10) {
139            // English format (only integer values) with ',' as separator
140            numberFormat = NumberFormatInfo.InvariantInfo;
141            separator = ',';
142          } else {
143            char[] disallowedSeparators = new char[] { ',' };
144            // German format (real values)
145            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
146            separator = POSSIBLE_SEPARATORS
147              .Except(disallowedSeparators)
148              .Where(c => OccurrencesOf(charCounts, c) > 10)
149              .OrderBy(c => -OccurrencesOf(charCounts, c))
150              .DefaultIfEmpty(' ')
151              .First();
152          }
153        } else {
154          // no points and no commas => English format
155          numberFormat = NumberFormatInfo.InvariantInfo;
156          separator = POSSIBLE_SEPARATORS
157            .Where(c => OccurrencesOf(charCounts, c) > 10)
158            .OrderBy(c => -OccurrencesOf(charCounts, c))
159            .DefaultIfEmpty(' ')
160            .First();
161        }
162      }
163    }
164
165    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
166      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
167    }
168
169    #region tokenizer
170    internal enum TokenTypeEnum {
171      NewLine, Separator, String, Double
172    }
173
174    internal class Token {
175      public TokenTypeEnum type;
176      public string stringValue;
177      public double doubleValue;
178
179      public Token(TokenTypeEnum type, string value) {
180        this.type = type;
181        stringValue = value;
182        doubleValue = 0.0;
183      }
184
185      public override string ToString() {
186        return stringValue;
187      }
188    }
189
190
191    internal class Tokenizer {
192      private StreamReader reader;
193      private List<Token> tokens;
194      private NumberFormatInfo numberFormatInfo;
195      private char separator;
196      private const string INTERNAL_SEPARATOR = "#";
197
198      private int currentLineNumber = 0;
199      public int CurrentLineNumber {
200        get { return currentLineNumber; }
201        private set { currentLineNumber = value; }
202      }
203      private string currentLine;
204      public string CurrentLine {
205        get { return currentLine; }
206        private set { currentLine = value; }
207      }
208
209      private Token newlineToken;
210      public Token NewlineToken {
211        get { return newlineToken; }
212        private set { newlineToken = value; }
213      }
214      private Token separatorToken;
215      public Token SeparatorToken {
216        get { return separatorToken; }
217        private set { separatorToken = value; }
218      }
219
220      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
221        this.reader = reader;
222        this.numberFormatInfo = numberFormatInfo;
223        this.separator = separator;
224        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
225        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
226        tokens = new List<Token>();
227        ReadNextTokens();
228      }
229
230      private void ReadNextTokens() {
231        if (!reader.EndOfStream) {
232          CurrentLine = reader.ReadLine();
233          var newTokens = from str in Split(CurrentLine)
234                          let trimmedStr = str.Trim()
235                          where !string.IsNullOrEmpty(trimmedStr)
236                          select MakeToken(trimmedStr);
237
238          tokens.AddRange(newTokens);
239          tokens.Add(NewlineToken);
240          CurrentLineNumber++;
241        }
242      }
243
244      private IEnumerable<string> Split(string line) {
245        StringBuilder subStr = new StringBuilder();
246        foreach (char c in line) {
247          if (c == separator) {
248            yield return subStr.ToString();
249            subStr = new StringBuilder();
250            // all separator characters are transformed to the internally used separator character
251            yield return INTERNAL_SEPARATOR;
252          } else {
253            subStr.Append(c);
254          }
255        }
256        yield return subStr.ToString();
257      }
258
259      private Token MakeToken(string strToken) {
260        Token token = new Token(TokenTypeEnum.String, strToken);
261        if (strToken.Equals(INTERNAL_SEPARATOR)) {
262          return SeparatorToken;
263        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
264          token.type = TokenTypeEnum.Double;
265          return token;
266        }
267
268        // couldn't parse the token as an int or float number so return a string token
269        return token;
270      }
271
272      public Token Peek() {
273        return tokens[0];
274      }
275
276      public Token Next() {
277        Token next = tokens[0];
278        tokens.RemoveAt(0);
279        if (tokens.Count == 0) {
280          ReadNextTokens();
281        }
282        return next;
283      }
284
285      public bool HasNext() {
286        return tokens.Count > 0 || !reader.EndOfStream;
287      }
288    }
289    #endregion
290
291    #region parsing
292    private void Parse() {
293      ParseVariableNames();
294      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
295      ParseValues();
296      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
297    }
298
299    private void ParseValues() {
300      while (tokenizer.HasNext()) {
301        List<double> row = new List<double>();
302        row.Add(NextValue(tokenizer));
303        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
304          Expect(tokenizer.SeparatorToken);
305          row.Add(NextValue(tokenizer));
306        }
307        Expect(tokenizer.NewlineToken);
308        // all rows have to have the same number of values           
309        // the first row defines how many samples are needed
310        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
311          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
312            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
313        }
314        // add the current row to the collection of rows and start a new row
315        rowValues.Add(row);
316        row = new List<double>();
317      }
318    }
319
320    private double NextValue(Tokenizer tokenizer) {
321      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
322      Token current = tokenizer.Next();
323      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
324        return double.NaN;
325      } else if (current.type == TokenTypeEnum.Double) {
326        // just take the value
327        return current.doubleValue;
328      }
329      // found an unexpected token => throw error
330      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
331      // this line is never executed because Error() throws an exception
332      throw new InvalidOperationException();
333    }
334
335    private void ParseVariableNames() {
336      // if the first line doesn't start with a double value then we assume that the
337      // first line contains variable names
338      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
339
340        List<Token> tokens = new List<Token>();
341        Token valueToken;
342        valueToken = tokenizer.Next();
343        tokens.Add(valueToken);
344        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
345          Expect(tokenizer.SeparatorToken);
346          valueToken = tokenizer.Next();
347          if (valueToken != tokenizer.NewlineToken) {
348            tokens.Add(valueToken);
349          }
350        }
351        if (valueToken != tokenizer.NewlineToken) {
352          Expect(tokenizer.NewlineToken);
353        }
354        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
355      }
356    }
357
358    private void Expect(Token expectedToken) {
359      Token actualToken = tokenizer.Next();
360      if (actualToken != expectedToken) {
361        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
362      }
363    }
364
365    private void Error(string message, string token, int lineNumber) {
366      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
367    }
368    #endregion
369
370    [Serializable]
371    private class DataFormatException : Exception {
372      private int line;
373      public int Line {
374        get { return line; }
375      }
376      private string token;
377      public string Token {
378        get { return token; }
379      }
380      public DataFormatException(string message, string token, int line)
381        : base(message + "\nToken: " + token + " (line: " + line + ")") {
382        this.token = token;
383        this.line = line;
384      }
385
386      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
387    }
388  }
389}
Note: See TracBrowser for help on using the repository browser.