source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 9449

Last change on this file since 9449 was 9449, checked in by sforsten, 9 years ago

#2045: fixed described problems

File size: 18.7 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30using System.Text;
31
32namespace HeuristicLab.Problems.Instances.DataAnalysis {
33  public class TableFileParser {
34    private const int BUFFER_SIZE = 65536;
35    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', ' ' };
36    private Tokenizer tokenizer;
37    private List<List<object>> rowValues;
38
39    private int rows;
40    public int Rows {
41      get { return rows; }
42      set { rows = value; }
43    }
44
45    private int columns;
46    public int Columns {
47      get { return columns; }
48      set { columns = value; }
49    }
50
51    private List<IList> values;
52    public List<IList> Values {
53      get {
54        return values;
55      }
56    }
57
58    private List<string> variableNames;
59    public IEnumerable<string> VariableNames {
60      get {
61        if (variableNames.Count > 0) return variableNames;
62        else {
63          string[] names = new string[columns];
64          for (int i = 0; i < names.Length; i++) {
65            names[i] = "X" + i.ToString("000");
66          }
67          return names;
68        }
69      }
70    }
71
72    public TableFileParser() {
73      rowValues = new List<List<object>>();
74      variableNames = new List<string>();
75    }
76
77    /// <summary>
78    /// Parses a file and determines the format first
79    /// </summary>
80    /// <param name="fileName">file which is parsed</param>
81    public void Parse(string fileName) {
82      NumberFormatInfo numberFormat;
83      DateTimeFormatInfo dateTimeFormatInfo;
84      char separator;
85      DetermineFileFormat(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
86      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator);
87    }
88
89    /// <summary>
90    /// Parses a file with the given formats
91    /// </summary>
92    /// <param name="fileName">file which is parsed</param>
93    /// <param name="numberFormat">Format of numbers</param>
94    /// <param name="dateTimeFormatInfo">Format of datetime</param>
95    /// <param name="separator">defines the separator</param>
96    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
97      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator);
98    }
99
100    /// <summary>
101    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
102    /// </summary>
103    /// <param name="stream">stream which is parsed</param>
104    public void Parse(Stream stream) {
105      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
106      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
107      char separator = ',';
108      Parse(stream, numberFormat, dateTimeFormatInfo, separator);
109    }
110
111    /// <summary>
112    /// Parses a stream with the given formats.
113    /// </summary>
114    /// <param name="stream">Stream which is parsed</param>   
115    /// <param name="numberFormat">Format of numbers</param>
116    /// <param name="dateTimeFormatInfo">Format of datetime</param>
117    /// <param name="separator">defines the separator</param>
118    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
119      using (StreamReader reader = new StreamReader(stream)) {
120        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
121        // parse the file
122        Parse();
123      }
124
125      // translate the list of samples into a DoubleMatrixData item
126      rows = rowValues.Count;
127      columns = rowValues[0].Count;
128      values = new List<IList>();
129
130      //create columns
131      for (int col = 0; col < columns; col++) {
132        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
133        if (!types.Any()) {
134          values.Add(new List<string>());
135          continue;
136        }
137
138        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
139        if (columnType == typeof(double)) values.Add(new List<double>());
140        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
141        else if (columnType == typeof(string)) values.Add(new List<string>());
142        else throw new InvalidOperationException();
143      }
144
145
146
147      //fill with values
148      foreach (List<object> row in rowValues) {
149        int columnIndex = 0;
150        foreach (object element in row) {
151          if (values[columnIndex] is List<double> && !(element is double))
152            values[columnIndex].Add(double.NaN);
153          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
154            values[columnIndex].Add(DateTime.MinValue);
155          else if (values[columnIndex] is List<string> && !(element is string))
156            values[columnIndex].Add(string.Empty);
157          else
158            values[columnIndex].Add(element);
159          columnIndex++;
160        }
161      }
162    }
163
164    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
165      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
166    }
167
168    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
169      using (StreamReader reader = new StreamReader(stream)) {
170        // skip first line
171        reader.ReadLine();
172        // read a block
173        char[] buffer = new char[BUFFER_SIZE];
174        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
175        // count frequency of special characters
176        Dictionary<char, int> charCounts = buffer.Take(charsRead)
177          .GroupBy(c => c)
178          .ToDictionary(g => g.Key, g => g.Count());
179
180        // depending on the characters occuring in the block
181        // we distinghish a number of different cases based on the the following rules:
182        // many points => it must be English number format, the other frequently occuring char is the separator
183        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
184        //   => check the line in more detail:
185        //            English: 0, 0, 0, 0
186        //            German:  0,0 0,0 0,0 ...
187        //            => if commas are followed by space => English format
188        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
189        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
190        if (OccurrencesOf(charCounts, '.') > 10) {
191          numberFormat = NumberFormatInfo.InvariantInfo;
192          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
193          separator = POSSIBLE_SEPARATORS
194            .Where(c => OccurrencesOf(charCounts, c) > 10)
195            .OrderBy(c => -OccurrencesOf(charCounts, c))
196            .DefaultIfEmpty(' ')
197            .First();
198        } else if (OccurrencesOf(charCounts, ',') > 10) {
199          // no points and many commas
200          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
201          int tokensWithMultipleCommas = 0;
202          for (int i = 0; i < charsRead; i++) {
203            int nCommas = 0;
204            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
205              if (buffer[i] == ',') nCommas++;
206              i++;
207            }
208            if (nCommas > 2) tokensWithMultipleCommas++;
209          }
210          if (tokensWithMultipleCommas > 1) {
211            // English format (only integer values) with ',' as separator
212            numberFormat = NumberFormatInfo.InvariantInfo;
213            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
214            separator = ',';
215          } else {
216            char[] disallowedSeparators = new char[] { ',' };
217            // German format (real values)
218            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
219            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
220            separator = POSSIBLE_SEPARATORS
221              .Except(disallowedSeparators)
222              .Where(c => OccurrencesOf(charCounts, c) > 10)
223              .OrderBy(c => -OccurrencesOf(charCounts, c))
224              .DefaultIfEmpty(' ')
225              .First();
226          }
227        } else {
228          // no points and no commas => English format
229          numberFormat = NumberFormatInfo.InvariantInfo;
230          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
231          separator = POSSIBLE_SEPARATORS
232            .Where(c => OccurrencesOf(charCounts, c) > 10)
233            .OrderBy(c => -OccurrencesOf(charCounts, c))
234            .DefaultIfEmpty(' ')
235            .First();
236        }
237      }
238    }
239
240    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
241      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
242    }
243
244    #region tokenizer
245    internal enum TokenTypeEnum {
246      NewLine, Separator, String, Double, DateTime
247    }
248
249    internal class Token {
250      public TokenTypeEnum type;
251      public string stringValue;
252      public double doubleValue;
253      public DateTime dateTimeValue;
254
255      public Token(TokenTypeEnum type, string value) {
256        this.type = type;
257        stringValue = value;
258        dateTimeValue = DateTime.MinValue;
259        doubleValue = 0.0;
260      }
261
262      public override string ToString() {
263        return stringValue;
264      }
265    }
266
267
268    internal class Tokenizer {
269      private StreamReader reader;
270      private List<Token> tokens;
271      private NumberFormatInfo numberFormatInfo;
272      private DateTimeFormatInfo dateTimeFormatInfo;
273      private char separator;
274      private const string INTERNAL_SEPARATOR = "#";
275
276      private int currentLineNumber = 0;
277      public int CurrentLineNumber {
278        get { return currentLineNumber; }
279        private set { currentLineNumber = value; }
280      }
281      private string currentLine;
282      public string CurrentLine {
283        get { return currentLine; }
284        private set { currentLine = value; }
285      }
286
287      private Token newlineToken;
288      public Token NewlineToken {
289        get { return newlineToken; }
290        private set { newlineToken = value; }
291      }
292      private Token separatorToken;
293      public Token SeparatorToken {
294        get { return separatorToken; }
295        private set { separatorToken = value; }
296      }
297
298      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
299        this.reader = reader;
300        this.numberFormatInfo = numberFormatInfo;
301        this.dateTimeFormatInfo = dateTimeFormatInfo;
302        this.separator = separator;
303        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
304        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
305        tokens = new List<Token>();
306        ReadNextTokens();
307      }
308
309      private void ReadNextTokens() {
310        if (!reader.EndOfStream) {
311          CurrentLine = reader.ReadLine();
312          var newTokens = from str in Split(CurrentLine)
313                          let trimmedStr = str.Trim()
314                          where !string.IsNullOrEmpty(trimmedStr)
315                          select MakeToken(trimmedStr);
316
317          tokens.AddRange(newTokens);
318          tokens.Add(NewlineToken);
319          CurrentLineNumber++;
320        }
321      }
322
323      private IEnumerable<string> Split(string line) {
324        StringBuilder subStr = new StringBuilder();
325        foreach (char c in line) {
326          if (c == separator) {
327            yield return subStr.ToString();
328            subStr = new StringBuilder();
329            // all separator characters are transformed to the internally used separator character
330            yield return INTERNAL_SEPARATOR;
331          } else {
332            subStr.Append(c);
333          }
334        }
335        yield return subStr.ToString();
336      }
337
338      private Token MakeToken(string strToken) {
339        Token token = new Token(TokenTypeEnum.String, strToken);
340        if (strToken.Equals(INTERNAL_SEPARATOR)) {
341          return SeparatorToken;
342        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
343          token.type = TokenTypeEnum.Double;
344          return token;
345        } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
346          token.type = TokenTypeEnum.DateTime;
347          return token;
348        }
349
350        // couldn't parse the token as an int or float number  or datetime value so return a string token
351        return token;
352      }
353
354      public Token Peek() {
355        return tokens[0];
356      }
357
358      public Token Next() {
359        Token next = tokens[0];
360        tokens.RemoveAt(0);
361        if (tokens.Count == 0) {
362          ReadNextTokens();
363        }
364        return next;
365      }
366
367      public bool HasNext() {
368        return tokens.Count > 0 || !reader.EndOfStream;
369      }
370    }
371    #endregion
372
373    #region parsing
374    private void Parse() {
375      ParseVariableNames();
376      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
377      ParseValues();
378      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
379    }
380
381    private void ParseValues() {
382      while (tokenizer.HasNext()) {
383        if (tokenizer.Peek() == tokenizer.NewlineToken) {
384          tokenizer.Next();
385        } else {
386          List<object> row = new List<object>();
387          object value = NextValue(tokenizer);
388          row.Add(value);
389          while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
390            Expect(tokenizer.SeparatorToken);
391            row.Add(NextValue(tokenizer));
392          }
393          Expect(tokenizer.NewlineToken);
394          // all rows have to have the same number of values           
395          // the first row defines how many samples are needed
396          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
397            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
398                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
399                  tokenizer.CurrentLineNumber);
400          }
401          rowValues.Add(row);
402        }
403      }
404    }
405
406    private object NextValue(Tokenizer tokenizer) {
407      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
408      Token current = tokenizer.Next();
409      if (current.type == TokenTypeEnum.Separator) {
410        return double.NaN;
411      } else if (current.type == TokenTypeEnum.String) {
412        return current.stringValue;
413      } else if (current.type == TokenTypeEnum.Double) {
414        return current.doubleValue;
415      } else if (current.type == TokenTypeEnum.DateTime) {
416        return current.dateTimeValue;
417      }
418      // found an unexpected token => throw error
419      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
420      // this line is never executed because Error() throws an exception
421      throw new InvalidOperationException();
422    }
423
424    private void ParseVariableNames() {
425      //if first token is double no variables names are given
426      if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
427
428      // the first line must contain variable names
429      List<Token> tokens = new List<Token>();
430      Token valueToken;
431      valueToken = tokenizer.Next();
432      tokens.Add(valueToken);
433      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
434        Expect(tokenizer.SeparatorToken);
435        valueToken = tokenizer.Next();
436        if (valueToken != tokenizer.NewlineToken) {
437          tokens.Add(valueToken);
438        }
439      }
440      if (valueToken != tokenizer.NewlineToken) {
441        Expect(tokenizer.NewlineToken);
442      }
443      variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
444    }
445
446    private void Expect(Token expectedToken) {
447      Token actualToken = tokenizer.Next();
448      if (actualToken != expectedToken) {
449        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
450      }
451    }
452
453    private void Error(string message, string token, int lineNumber) {
454      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
455    }
456    #endregion
457
458    [Serializable]
459    public class DataFormatException : Exception {
460      private int line;
461      public int Line {
462        get { return line; }
463      }
464      private string token;
465      public string Token {
466        get { return token; }
467      }
468      public DataFormatException(string message, string token, int line)
469        : base(message + "\nToken: " + token + " (line: " + line + ")") {
470        this.token = token;
471        this.line = line;
472      }
473
474      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
475    }
476  }
477}
Note: See TracBrowser for help on using the repository browser.