Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysisCSVImport/3.3/TableFileParser.cs @ 8695

Last change on this file since 8695 was 8564, checked in by gkronber, 12 years ago

#1890

  • added an extension to calculate the range of IEnumerable<double>
  • increased the buffer size for the heuristic determination of separator characters in the table file parser (to make it work with files that have more than 1024 bytes in the second line).
File size: 18.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30using System.Text;
31
32namespace HeuristicLab.Problems.Instances.DataAnalysis {
33  public class TableFileParser {
34    private const int BUFFER_SIZE = 65536;
35    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
36    private Tokenizer tokenizer;
37    private List<List<object>> rowValues;
38
39    private int rows;
40    public int Rows {
41      get { return rows; }
42      set { rows = value; }
43    }
44
45    private int columns;
46    public int Columns {
47      get { return columns; }
48      set { columns = value; }
49    }
50
51    private List<IList> values;
52    public List<IList> Values {
53      get {
54        return values;
55      }
56    }
57
58    private List<string> variableNames;
59    public IEnumerable<string> VariableNames {
60      get {
61        if (variableNames.Count > 0) return variableNames;
62        else {
63          string[] names = new string[columns];
64          for (int i = 0; i < names.Length; i++) {
65            names[i] = "X" + i.ToString("000");
66          }
67          return names;
68        }
69      }
70    }
71
72    public TableFileParser() {
73      rowValues = new List<List<object>>();
74      variableNames = new List<string>();
75    }
76
77    /// <summary>
78    /// Parses a file and determines the format first
79    /// </summary>
80    /// <param name="fileName">file which is parsed</param>
81    public void Parse(string fileName) {
82      NumberFormatInfo numberFormat;
83      DateTimeFormatInfo dateTimeFormatInfo;
84      char separator;
85      DetermineFileFormat(new FileStream(fileName, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
86      Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
87    }
88
89    /// <summary>
90    /// Parses a file with the given formats
91    /// </summary>
92    /// <param name="fileName">file which is parsed</param>
93    /// <param name="numberFormat">Format of numbers</param>
94    /// <param name="dateTimeFormatInfo">Format of datetime</param>
95    /// <param name="separator">defines the separator</param>
96    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
97      Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
98    }
99
100    /// <summary>
101    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
102    /// </summary>
103    /// <param name="stream">stream which is parsed</param>
104    public void Parse(Stream stream) {
105      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
106      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
107      char separator = ',';
108      Parse(stream, numberFormat, dateTimeFormatInfo, separator);
109    }
110
111    /// <summary>
112    /// Parses a stream with the given formats.
113    /// </summary>
114    /// <param name="stream">Stream which is parsed</param>   
115    /// <param name="numberFormat">Format of numbers</param>
116    /// <param name="dateTimeFormatInfo">Format of datetime</param>
117    /// <param name="separator">defines the separator</param>
118    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
119      using (StreamReader reader = new StreamReader(stream)) {
120        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
121        // parse the file
122        Parse();
123      }
124
125      // translate the list of samples into a DoubleMatrixData item
126      rows = rowValues.Count;
127      columns = rowValues[0].Count;
128      values = new List<IList>();
129
130      //create columns
131      for (int col = 0; col < columns; col++) {
132        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
133        if (!types.Any()) {
134          values.Add(new List<string>());
135          continue;
136        }
137
138        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
139        if (columnType == typeof(double)) values.Add(new List<double>());
140        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
141        else if (columnType == typeof(string)) values.Add(new List<string>());
142        else throw new InvalidOperationException();
143      }
144
145
146
147      //fill with values
148      foreach (List<object> row in rowValues) {
149        int columnIndex = 0;
150        foreach (object element in row) {
151          if (values[columnIndex] is List<double> && !(element is double))
152            values[columnIndex].Add(double.NaN);
153          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
154            values[columnIndex].Add(DateTime.MinValue);
155          else if (values[columnIndex] is List<string> && !(element is string))
156            values[columnIndex].Add(string.Empty);
157          else
158            values[columnIndex].Add(element);
159          columnIndex++;
160        }
161      }
162    }
163
164    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
165      DetermineFileFormat(new FileStream(path, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
166    }
167
168    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
169      using (StreamReader reader = new StreamReader(stream)) {
170        // skip first line
171        reader.ReadLine();
172        // read a block
173        char[] buffer = new char[BUFFER_SIZE];
174        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
175        // count frequency of special characters
176        Dictionary<char, int> charCounts = buffer.Take(charsRead)
177          .GroupBy(c => c)
178          .ToDictionary(g => g.Key, g => g.Count());
179
180        // depending on the characters occuring in the block
181        // we distinghish a number of different cases based on the the following rules:
182        // many points => it must be English number format, the other frequently occuring char is the separator
183        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
184        //   => check the line in more detail:
185        //            English: 0, 0, 0, 0
186        //            German:  0,0 0,0 0,0 ...
187        //            => if commas are followed by space => English format
188        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
189        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
190        if (OccurrencesOf(charCounts, '.') > 10) {
191          numberFormat = NumberFormatInfo.InvariantInfo;
192          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
193          separator = POSSIBLE_SEPARATORS
194            .Where(c => OccurrencesOf(charCounts, c) > 10)
195            .OrderBy(c => -OccurrencesOf(charCounts, c))
196            .DefaultIfEmpty(' ')
197            .First();
198        } else if (OccurrencesOf(charCounts, ',') > 10) {
199          // no points and many commas
200          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
201          int tokensWithMultipleCommas = 0;
202          for (int i = 0; i < charsRead; i++) {
203            int nCommas = 0;
204            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
205              if (buffer[i] == ',') nCommas++;
206              i++;
207            }
208            if (nCommas > 2) tokensWithMultipleCommas++;
209          }
210          if (tokensWithMultipleCommas > 1) {
211            // English format (only integer values) with ',' as separator
212            numberFormat = NumberFormatInfo.InvariantInfo;
213            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
214            separator = ',';
215          } else {
216            char[] disallowedSeparators = new char[] { ',' };
217            // German format (real values)
218            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
219            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
220            separator = POSSIBLE_SEPARATORS
221              .Except(disallowedSeparators)
222              .Where(c => OccurrencesOf(charCounts, c) > 10)
223              .OrderBy(c => -OccurrencesOf(charCounts, c))
224              .DefaultIfEmpty(' ')
225              .First();
226          }
227        } else {
228          // no points and no commas => English format
229          numberFormat = NumberFormatInfo.InvariantInfo;
230          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
231          separator = POSSIBLE_SEPARATORS
232            .Where(c => OccurrencesOf(charCounts, c) > 10)
233            .OrderBy(c => -OccurrencesOf(charCounts, c))
234            .DefaultIfEmpty(' ')
235            .First();
236        }
237      }
238    }
239
240    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
241      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
242    }
243
244    #region tokenizer
245    internal enum TokenTypeEnum {
246      NewLine, Separator, String, Double, DateTime
247    }
248
249    internal class Token {
250      public TokenTypeEnum type;
251      public string stringValue;
252      public double doubleValue;
253      public DateTime dateTimeValue;
254
255      public Token(TokenTypeEnum type, string value) {
256        this.type = type;
257        stringValue = value;
258        dateTimeValue = DateTime.MinValue;
259        doubleValue = 0.0;
260      }
261
262      public override string ToString() {
263        return stringValue;
264      }
265    }
266
267
268    internal class Tokenizer {
269      private StreamReader reader;
270      private List<Token> tokens;
271      private NumberFormatInfo numberFormatInfo;
272      private DateTimeFormatInfo dateTimeFormatInfo;
273      private char separator;
274      private const string INTERNAL_SEPARATOR = "#";
275
276      private int currentLineNumber = 0;
277      public int CurrentLineNumber {
278        get { return currentLineNumber; }
279        private set { currentLineNumber = value; }
280      }
281      private string currentLine;
282      public string CurrentLine {
283        get { return currentLine; }
284        private set { currentLine = value; }
285      }
286
287      private Token newlineToken;
288      public Token NewlineToken {
289        get { return newlineToken; }
290        private set { newlineToken = value; }
291      }
292      private Token separatorToken;
293      public Token SeparatorToken {
294        get { return separatorToken; }
295        private set { separatorToken = value; }
296      }
297
298      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
299        this.reader = reader;
300        this.numberFormatInfo = numberFormatInfo;
301        this.dateTimeFormatInfo = dateTimeFormatInfo;
302        this.separator = separator;
303        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
304        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
305        tokens = new List<Token>();
306        ReadNextTokens();
307      }
308
309      private void ReadNextTokens() {
310        if (!reader.EndOfStream) {
311          CurrentLine = reader.ReadLine();
312          var newTokens = from str in Split(CurrentLine)
313                          let trimmedStr = str.Trim()
314                          where !string.IsNullOrEmpty(trimmedStr)
315                          select MakeToken(trimmedStr);
316
317          tokens.AddRange(newTokens);
318          tokens.Add(NewlineToken);
319          CurrentLineNumber++;
320        }
321      }
322
323      private IEnumerable<string> Split(string line) {
324        StringBuilder subStr = new StringBuilder();
325        foreach (char c in line) {
326          if (c == separator) {
327            yield return subStr.ToString();
328            subStr = new StringBuilder();
329            // all separator characters are transformed to the internally used separator character
330            yield return INTERNAL_SEPARATOR;
331          } else {
332            subStr.Append(c);
333          }
334        }
335        yield return subStr.ToString();
336      }
337
338      private Token MakeToken(string strToken) {
339        Token token = new Token(TokenTypeEnum.String, strToken);
340        if (strToken.Equals(INTERNAL_SEPARATOR)) {
341          return SeparatorToken;
342        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
343          token.type = TokenTypeEnum.Double;
344          return token;
345        } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
346          token.type = TokenTypeEnum.DateTime;
347          return token;
348        }
349
350        // couldn't parse the token as an int or float number  or datetime value so return a string token
351        return token;
352      }
353
354      public Token Peek() {
355        return tokens[0];
356      }
357
358      public Token Next() {
359        Token next = tokens[0];
360        tokens.RemoveAt(0);
361        if (tokens.Count == 0) {
362          ReadNextTokens();
363        }
364        return next;
365      }
366
367      public bool HasNext() {
368        return tokens.Count > 0 || !reader.EndOfStream;
369      }
370    }
371    #endregion
372
373    #region parsing
374    private void Parse() {
375      ParseVariableNames();
376      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
377      ParseValues();
378      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
379    }
380
381    private void ParseValues() {
382      while (tokenizer.HasNext()) {
383        if (tokenizer.Peek() == tokenizer.NewlineToken) {
384          tokenizer.Next();
385        } else {
386          List<object> row = new List<object>();
387          object value = NextValue(tokenizer);
388          row.Add(value);
389          while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
390            Expect(tokenizer.SeparatorToken);
391            row.Add(NextValue(tokenizer));
392          }
393          Expect(tokenizer.NewlineToken);
394          // all rows have to have the same number of values           
395          // the first row defines how many samples are needed
396          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
397            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
398                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
399                  tokenizer.CurrentLineNumber);
400          }
401          rowValues.Add(row);
402        }
403      }
404    }
405
406    private object NextValue(Tokenizer tokenizer) {
407      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
408      Token current = tokenizer.Next();
409      if (current.type == TokenTypeEnum.Separator) {
410        return double.NaN;
411      } else if (current.type == TokenTypeEnum.String) {
412        return current.stringValue;
413      } else if (current.type == TokenTypeEnum.Double) {
414        return current.doubleValue;
415      } else if (current.type == TokenTypeEnum.DateTime) {
416        return current.dateTimeValue;
417      }
418      // found an unexpected token => throw error
419      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
420      // this line is never executed because Error() throws an exception
421      throw new InvalidOperationException();
422    }
423
424    private void ParseVariableNames() {
425      //if first token is double no variables names are given
426      if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
427
428      // the first line must contain variable names
429      List<Token> tokens = new List<Token>();
430      Token valueToken;
431      valueToken = tokenizer.Next();
432      tokens.Add(valueToken);
433      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
434        Expect(tokenizer.SeparatorToken);
435        valueToken = tokenizer.Next();
436        if (valueToken != tokenizer.NewlineToken) {
437          tokens.Add(valueToken);
438        }
439      }
440      if (valueToken != tokenizer.NewlineToken) {
441        Expect(tokenizer.NewlineToken);
442      }
443      variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
444    }
445
446    private void Expect(Token expectedToken) {
447      Token actualToken = tokenizer.Next();
448      if (actualToken != expectedToken) {
449        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
450      }
451    }
452
453    private void Error(string message, string token, int lineNumber) {
454      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
455    }
456    #endregion
457
458    [Serializable]
459    private class DataFormatException : Exception {
460      private int line;
461      public int Line {
462        get { return line; }
463      }
464      private string token;
465      public string Token {
466        get { return token; }
467      }
468      public DataFormatException(string message, string token, int line)
469        : base(message + "\nToken: " + token + " (line: " + line + ")") {
470        this.token = token;
471        this.line = line;
472      }
473
474      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
475    }
476  }
477}
Note: See TracBrowser for help on using the repository browser.