Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 6349

Last change on this file since 6349 was 5445, checked in by swagner, 14 years ago

Updated year of copyrights (#1406)

File size: 13.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using System.Text;
28
29namespace HeuristicLab.Problems.DataAnalysis {
30  public class TableFileParser {
31    private const int BUFFER_SIZE = 1024;
32    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
33    private Tokenizer tokenizer;
34    private List<List<double>> rowValues;
35
36    private int rows;
37    public int Rows {
38      get { return rows; }
39      set { rows = value; }
40    }
41
42    private int columns;
43    public int Columns {
44      get { return columns; }
45      set { columns = value; }
46    }
47
48    private double[,] values;
49    public double[,] Values {
50      get {
51        return values;
52      }
53    }
54
55    private List<string> variableNames;
56    public IEnumerable<string> VariableNames {
57      get {
58        if (variableNames.Count > 0) return variableNames;
59        else {
60          string[] names = new string[columns];
61          for (int i = 0; i < names.Length; i++) {
62            names[i] = "X" + i.ToString("000");
63          }
64          return names;
65        }
66      }
67    }
68
69    public TableFileParser() {
70      rowValues = new List<List<double>>();
71      variableNames = new List<string>();
72    }
73
74    public void Parse(string fileName) {
75      NumberFormatInfo numberFormat;
76      char separator;
77      DetermineFileFormat(fileName, out numberFormat, out separator);
78      using (StreamReader reader = new StreamReader(fileName)) {
79        tokenizer = new Tokenizer(reader, numberFormat, separator);
80        // parse the file
81        Parse();
82      }
83
84      // translate the list of samples into a DoubleMatrixData item
85      rows = rowValues.Count;
86      columns = rowValues[0].Count;
87      values = new double[rows, columns];
88
89      int rowIndex = 0;
90      int columnIndex = 0;
91      foreach (List<double> row in rowValues) {
92        columnIndex = 0;
93        foreach (double element in row) {
94          values[rowIndex, columnIndex++] = element;
95        }
96        rowIndex++;
97      }
98    }
99
100    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
101      using (StreamReader reader = new StreamReader(fileName)) {
102        // skip first line
103        reader.ReadLine();
104        // read a block
105        char[] buffer = new char[BUFFER_SIZE];
106        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
107        // count frequency of special characters
108        Dictionary<char, int> charCounts = buffer.Take(charsRead)
109          .GroupBy(c => c)
110          .ToDictionary(g => g.Key, g => g.Count());
111
112        // depending on the characters occuring in the block
113        // we distinghish a number of different cases based on the the following rules:
114        // many points => it must be English number format, the other frequently occuring char is the separator
115        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
116        //   => check the line in more detail:
117        //            English: 0, 0, 0, 0
118        //            German:  0,0 0,0 0,0 ...
119        //            => if commas are followed by space => English format
120        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
121        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
122        if (OccurrencesOf(charCounts, '.') > 10) {
123          numberFormat = NumberFormatInfo.InvariantInfo;
124          separator = POSSIBLE_SEPARATORS
125            .Where(c => OccurrencesOf(charCounts, c) > 10)
126            .OrderBy(c => -OccurrencesOf(charCounts, c))
127            .DefaultIfEmpty(' ')
128            .First();
129        } else if (OccurrencesOf(charCounts, ',') > 10) {
130          // no points and many commas
131          int countCommaNonDigitPairs = 0;
132          for (int i = 0; i < charsRead - 1; i++) {
133            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
134              countCommaNonDigitPairs++;
135            }
136          }
137          if (countCommaNonDigitPairs > 10) {
138            // English format (only integer values) with ',' as separator
139            numberFormat = NumberFormatInfo.InvariantInfo;
140            separator = ',';
141          } else {
142            char[] disallowedSeparators = new char[] { ',' };
143            // German format (real values)
144            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
145            separator = POSSIBLE_SEPARATORS
146              .Except(disallowedSeparators)
147              .Where(c => OccurrencesOf(charCounts, c) > 10)
148              .OrderBy(c => -OccurrencesOf(charCounts, c))
149              .DefaultIfEmpty(' ')
150              .First();
151          }
152        } else {
153          // no points and no commas => English format
154          numberFormat = NumberFormatInfo.InvariantInfo;
155          separator = POSSIBLE_SEPARATORS
156            .Where(c => OccurrencesOf(charCounts, c) > 10)
157            .OrderBy(c => -OccurrencesOf(charCounts, c))
158            .DefaultIfEmpty(' ')
159            .First();
160        }
161      }
162    }
163
164    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
165      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
166    }
167
168    #region tokenizer
169    internal enum TokenTypeEnum {
170      NewLine, Separator, String, Double
171    }
172
173    internal class Token {
174      public TokenTypeEnum type;
175      public string stringValue;
176      public double doubleValue;
177
178      public Token(TokenTypeEnum type, string value) {
179        this.type = type;
180        stringValue = value;
181        doubleValue = 0.0;
182      }
183
184      public override string ToString() {
185        return stringValue;
186      }
187    }
188
189
190    internal class Tokenizer {
191      private StreamReader reader;
192      private List<Token> tokens;
193      private NumberFormatInfo numberFormatInfo;
194      private char separator;
195      private const string INTERNAL_SEPARATOR = "#";
196
197      private int currentLineNumber = 0;
198      public int CurrentLineNumber {
199        get { return currentLineNumber; }
200        private set { currentLineNumber = value; }
201      }
202      private string currentLine;
203      public string CurrentLine {
204        get { return currentLine; }
205        private set { currentLine = value; }
206      }
207
208      private Token newlineToken;
209      public Token NewlineToken {
210        get { return newlineToken; }
211        private set { newlineToken = value; }
212      }
213      private Token separatorToken;
214      public Token SeparatorToken {
215        get { return separatorToken; }
216        private set { separatorToken = value; }
217      }
218
219      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
220        this.reader = reader;
221        this.numberFormatInfo = numberFormatInfo;
222        this.separator = separator;
223        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
224        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
225        tokens = new List<Token>();
226        ReadNextTokens();
227      }
228
229      private void ReadNextTokens() {
230        if (!reader.EndOfStream) {
231          CurrentLine = reader.ReadLine();
232          var newTokens = from str in Split(CurrentLine)
233                          let trimmedStr = str.Trim()
234                          where !string.IsNullOrEmpty(trimmedStr)
235                          select MakeToken(trimmedStr);
236
237          tokens.AddRange(newTokens);
238          tokens.Add(NewlineToken);
239          CurrentLineNumber++;
240        }
241      }
242
243      private IEnumerable<string> Split(string line) {
244        StringBuilder subStr = new StringBuilder();
245        foreach (char c in line) {
246          if (c == separator) {
247            yield return subStr.ToString();
248            subStr = new StringBuilder();
249            // all separator characters are transformed to the internally used separator character
250            yield return INTERNAL_SEPARATOR;
251          } else {
252            subStr.Append(c);
253          }
254        }
255        yield return subStr.ToString();
256      }
257
258      private Token MakeToken(string strToken) {
259        Token token = new Token(TokenTypeEnum.String, strToken);
260        if (strToken.Equals(INTERNAL_SEPARATOR)) {
261          return SeparatorToken;
262        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
263          token.type = TokenTypeEnum.Double;
264          return token;
265        }
266
267        // couldn't parse the token as an int or float number so return a string token
268        return token;
269      }
270
271      public Token Peek() {
272        return tokens[0];
273      }
274
275      public Token Next() {
276        Token next = tokens[0];
277        tokens.RemoveAt(0);
278        if (tokens.Count == 0) {
279          ReadNextTokens();
280        }
281        return next;
282      }
283
284      public bool HasNext() {
285        return tokens.Count > 0 || !reader.EndOfStream;
286      }
287    }
288    #endregion
289
290    #region parsing
291    private void Parse() {
292      ParseVariableNames();
293      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
294      ParseValues();
295      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
296    }
297
298    private void ParseValues() {
299      while (tokenizer.HasNext()) {
300        List<double> row = new List<double>();
301        row.Add(NextValue(tokenizer));
302        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
303          Expect(tokenizer.SeparatorToken);
304          row.Add(NextValue(tokenizer));
305        }
306        Expect(tokenizer.NewlineToken);
307        // all rows have to have the same number of values           
308        // the first row defines how many samples are needed
309        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
310          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
311            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
312        }
313        // add the current row to the collection of rows and start a new row
314        rowValues.Add(row);
315        row = new List<double>();
316      }
317    }
318
319    private double NextValue(Tokenizer tokenizer) {
320      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
321      Token current = tokenizer.Next();
322      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
323        return double.NaN;
324      } else if (current.type == TokenTypeEnum.Double) {
325        // just take the value
326        return current.doubleValue;
327      }
328      // found an unexpected token => throw error
329      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
330      // this line is never executed because Error() throws an exception
331      throw new InvalidOperationException();
332    }
333
334    private void ParseVariableNames() {
335      // if the first line doesn't start with a double value then we assume that the
336      // first line contains variable names
337      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
338
339        List<Token> tokens = new List<Token>();
340        Token valueToken;
341        valueToken = tokenizer.Next();
342        tokens.Add(valueToken);
343        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
344          Expect(tokenizer.SeparatorToken);
345          valueToken = tokenizer.Next();
346          if (valueToken != tokenizer.NewlineToken) {
347            tokens.Add(valueToken);
348          }
349        }
350        if (valueToken != tokenizer.NewlineToken) {
351          Expect(tokenizer.NewlineToken);
352        }
353        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
354      }
355    }
356
357    private void Expect(Token expectedToken) {
358      Token actualToken = tokenizer.Next();
359      if (actualToken != expectedToken) {
360        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
361      }
362    }
363
364    private void Error(string message, string token, int lineNumber) {
365      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
366    }
367    #endregion
368  }
369}
Note: See TracBrowser for help on using the repository browser.