Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HLScript/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 10708

Last change on this file since 10708 was 9652, checked in by sforsten, 12 years ago

#2047: TableFileParser can now handle white spaces (currently the character '\0' symbolizes white spaces in the TableFileParser)

File size: 21.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30
31namespace HeuristicLab.Problems.Instances.DataAnalysis {
32  public class TableFileParser {
33    private const int BUFFER_SIZE = 65536;
34    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
35    private const char WHITESPACECHAR = (char)0;
36    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
37    private Tokenizer tokenizer;
38    private List<List<object>> rowValues;
39
40    private int rows;
41    public int Rows {
42      get { return rows; }
43      set { rows = value; }
44    }
45
46    private int columns;
47    public int Columns {
48      get { return columns; }
49      set { columns = value; }
50    }
51
52    private List<IList> values;
53    public List<IList> Values {
54      get {
55        return values;
56      }
57    }
58
59    private List<string> variableNames;
60    public IEnumerable<string> VariableNames {
61      get {
62        if (variableNames.Count > 0) return variableNames;
63        else {
64          string[] names = new string[columns];
65          for (int i = 0; i < names.Length; i++) {
66            names[i] = "X" + i.ToString("000");
67          }
68          return names;
69        }
70      }
71    }
72
73    public TableFileParser() {
74      rowValues = new List<List<object>>();
75      variableNames = new List<string>();
76    }
77
78    public bool AreColumnNamesInFirstLine(string fileName) {
79      NumberFormatInfo numberFormat;
80      DateTimeFormatInfo dateTimeFormatInfo;
81      char separator;
82      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
83      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
84        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
85      }
86    }
87
88    public bool AreColumnNamesInFirstLine(Stream stream) {
89      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
90      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
91      char separator = ',';
92      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
93    }
94
95    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
96                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
97      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
98        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
99      }
100    }
101
102    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
103                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
104      using (StreamReader reader = new StreamReader(stream)) {
105        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
106        return tokenizer.Peek().type != TokenTypeEnum.Double;
107      }
108    }
109
110    /// <summary>
111    /// Parses a file and determines the format first
112    /// </summary>
113    /// <param name="fileName">file which is parsed</param>
114    /// <param name="columnNamesInFirstLine"></param>
115    public void Parse(string fileName, bool columnNamesInFirstLine) {
116      NumberFormatInfo numberFormat;
117      DateTimeFormatInfo dateTimeFormatInfo;
118      char separator;
119      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
120      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
121    }
122
123    /// <summary>
124    /// Parses a file with the given formats
125    /// </summary>
126    /// <param name="fileName">file which is parsed</param>
127    /// <param name="numberFormat">Format of numbers</param>
128    /// <param name="dateTimeFormatInfo">Format of datetime</param>
129    /// <param name="separator">defines the separator</param>
130    /// <param name="columnNamesInFirstLine"></param>
131    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
132      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
133        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
134      }
135    }
136
137    /// <summary>
138    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
139    /// </summary>
140    /// <param name="stream">stream which is parsed</param>
141    /// <param name="columnNamesInFirstLine"></param>
142    public void Parse(Stream stream, bool columnNamesInFirstLine) {
143      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
144      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
145      char separator = ',';
146      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
147    }
148
149    /// <summary>
150    /// Parses a stream with the given formats.
151    /// </summary>
152    /// <param name="stream">Stream which is parsed</param>   
153    /// <param name="numberFormat">Format of numbers</param>
154    /// <param name="dateTimeFormatInfo">Format of datetime</param>
155    /// <param name="separator">defines the separator</param>
156    /// <param name="columnNamesInFirstLine"></param>
157    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
158      using (StreamReader reader = new StreamReader(stream)) {
159        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
160        // parse the file
161        Parse(columnNamesInFirstLine);
162      }
163
164      // translate the list of samples into a DoubleMatrixData item
165      rows = rowValues.Count;
166      columns = rowValues[0].Count;
167      values = new List<IList>();
168
169      //create columns
170      for (int col = 0; col < columns; col++) {
171        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
172        if (!types.Any()) {
173          values.Add(new List<string>());
174          continue;
175        }
176
177        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
178        if (columnType == typeof(double)) values.Add(new List<double>());
179        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
180        else if (columnType == typeof(string)) values.Add(new List<string>());
181        else throw new InvalidOperationException();
182      }
183
184
185
186      //fill with values
187      foreach (List<object> row in rowValues) {
188        int columnIndex = 0;
189        foreach (object element in row) {
190          if (values[columnIndex] is List<double> && !(element is double))
191            values[columnIndex].Add(double.NaN);
192          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
193            values[columnIndex].Add(DateTime.MinValue);
194          else if (values[columnIndex] is List<string> && !(element is string))
195            values[columnIndex].Add(string.Empty);
196          else
197            values[columnIndex].Add(element);
198          columnIndex++;
199        }
200      }
201    }
202
203    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
204      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
205    }
206
207    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
208      using (StreamReader reader = new StreamReader(stream)) {
209        // skip first line
210        reader.ReadLine();
211        // read a block
212        char[] buffer = new char[BUFFER_SIZE];
213        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
214        // count frequency of special characters
215        Dictionary<char, int> charCounts = buffer.Take(charsRead)
216          .GroupBy(c => c)
217          .ToDictionary(g => g.Key, g => g.Count());
218
219        // depending on the characters occuring in the block
220        // we distinghish a number of different cases based on the the following rules:
221        // many points => it must be English number format, the other frequently occuring char is the separator
222        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
223        //   => check the line in more detail:
224        //            English: 0, 0, 0, 0
225        //            German:  0,0 0,0 0,0 ...
226        //            => if commas are followed by space => English format
227        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
228        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
229        if (OccurrencesOf(charCounts, '.') > 10) {
230          numberFormat = NumberFormatInfo.InvariantInfo;
231          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
232          separator = POSSIBLE_SEPARATORS
233            .Where(c => OccurrencesOf(charCounts, c) > 10)
234            .OrderBy(c => -OccurrencesOf(charCounts, c))
235            .DefaultIfEmpty(' ')
236            .First();
237        } else if (OccurrencesOf(charCounts, ',') > 10) {
238          // no points and many commas
239          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
240          int tokensWithMultipleCommas = 0;
241          for (int i = 0; i < charsRead; i++) {
242            int nCommas = 0;
243            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
244              if (buffer[i] == ',') nCommas++;
245              i++;
246            }
247            if (nCommas > 2) tokensWithMultipleCommas++;
248          }
249          if (tokensWithMultipleCommas > 1) {
250            // English format (only integer values) with ',' as separator
251            numberFormat = NumberFormatInfo.InvariantInfo;
252            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
253            separator = ',';
254          } else {
255            char[] disallowedSeparators = new char[] { ',' };
256            // German format (real values)
257            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
258            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
259            separator = POSSIBLE_SEPARATORS
260              .Except(disallowedSeparators)
261              .Where(c => OccurrencesOf(charCounts, c) > 10)
262              .OrderBy(c => -OccurrencesOf(charCounts, c))
263              .DefaultIfEmpty(' ')
264              .First();
265          }
266        } else {
267          // no points and no commas => English format
268          numberFormat = NumberFormatInfo.InvariantInfo;
269          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
270          separator = POSSIBLE_SEPARATORS
271            .Where(c => OccurrencesOf(charCounts, c) > 10)
272            .OrderBy(c => -OccurrencesOf(charCounts, c))
273            .DefaultIfEmpty(' ')
274            .First();
275        }
276      }
277    }
278
279    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
280      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
281    }
282
283    #region tokenizer
284    internal enum TokenTypeEnum {
285      NewLine, Separator, String, Double, DateTime
286    }
287
288    internal class Token {
289      public TokenTypeEnum type;
290      public string stringValue;
291      public double doubleValue;
292      public DateTime dateTimeValue;
293
294      public Token(TokenTypeEnum type, string value) {
295        this.type = type;
296        stringValue = value;
297        dateTimeValue = DateTime.MinValue;
298        doubleValue = 0.0;
299      }
300
301      public override string ToString() {
302        return stringValue;
303      }
304    }
305
306
307    internal class Tokenizer {
308      private StreamReader reader;
309      private List<Token> tokens;
310      private NumberFormatInfo numberFormatInfo;
311      private DateTimeFormatInfo dateTimeFormatInfo;
312      private char separator;
313      private const string INTERNAL_SEPARATOR = "#";
314
315      private int currentLineNumber = 0;
316      public int CurrentLineNumber {
317        get { return currentLineNumber; }
318        private set { currentLineNumber = value; }
319      }
320      private string currentLine;
321      public string CurrentLine {
322        get { return currentLine; }
323        private set { currentLine = value; }
324      }
325
326      private Token newlineToken;
327      public Token NewlineToken {
328        get { return newlineToken; }
329        private set { newlineToken = value; }
330      }
331      private Token separatorToken;
332      public Token SeparatorToken {
333        get { return separatorToken; }
334        private set { separatorToken = value; }
335      }
336
337      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
338        this.reader = reader;
339        this.numberFormatInfo = numberFormatInfo;
340        this.dateTimeFormatInfo = dateTimeFormatInfo;
341        this.separator = separator;
342        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
343        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
344        tokens = new List<Token>();
345        ReadNextTokens();
346      }
347
348      private void ReadNextTokens() {
349        if (!reader.EndOfStream) {
350          CurrentLine = reader.ReadLine();
351          var newTokens = from str in Split(CurrentLine)
352                          let trimmedStr = str.Trim()
353                          where !string.IsNullOrEmpty(trimmedStr)
354                          select MakeToken(trimmedStr);
355
356          tokens.AddRange(newTokens);
357          tokens.Add(NewlineToken);
358          CurrentLineNumber++;
359        }
360      }
361
362      private IEnumerable<string> Split(string line) {
363        IEnumerable<string> splitString;
364        if (separator == WHITESPACECHAR) {
365          //separate whitespaces
366          splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
367        } else {
368          splitString = line.Split(separator);
369        }
370        int cur = splitString.Count();
371        foreach (var str in splitString) {
372          yield return str;
373          cur--;
374          // do not return the INTERNAL_SEPARATOR after the last string
375          if (cur != 0) {
376            yield return INTERNAL_SEPARATOR;
377          }
378        }
379      }
380
381      private Token MakeToken(string strToken) {
382        Token token = new Token(TokenTypeEnum.String, strToken);
383        if (strToken.Equals(INTERNAL_SEPARATOR)) {
384          return SeparatorToken;
385        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
386          token.type = TokenTypeEnum.Double;
387          return token;
388        } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
389          token.type = TokenTypeEnum.DateTime;
390          return token;
391        }
392
393        // couldn't parse the token as an int or float number  or datetime value so return a string token
394        return token;
395      }
396
397      public Token Peek() {
398        return tokens[0];
399      }
400
401      public Token Next() {
402        Token next = tokens[0];
403        tokens.RemoveAt(0);
404        if (tokens.Count == 0) {
405          ReadNextTokens();
406        }
407        return next;
408      }
409
410      public bool HasNext() {
411        return tokens.Count > 0 || !reader.EndOfStream;
412      }
413    }
414    #endregion
415
416    #region parsing
417    private void Parse(bool columnNamesInFirstLine) {
418      if (columnNamesInFirstLine) {
419        ParseVariableNames();
420        if (!tokenizer.HasNext())
421          Error(
422            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
423            "", tokenizer.CurrentLineNumber);
424      }
425      ParseValues();
426      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
427    }
428
429    private void ParseValues() {
430      while (tokenizer.HasNext()) {
431        if (tokenizer.Peek() == tokenizer.NewlineToken) {
432          tokenizer.Next();
433        } else {
434          List<object> row = new List<object>();
435          object value = NextValue(tokenizer);
436          row.Add(value);
437          while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
438            Expect(tokenizer.SeparatorToken);
439            row.Add(NextValue(tokenizer));
440          }
441          Expect(tokenizer.NewlineToken);
442          // all rows have to have the same number of values           
443          // the first row defines how many samples are needed
444          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
445            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
446                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
447                  tokenizer.CurrentLineNumber);
448          }
449          rowValues.Add(row);
450        }
451      }
452    }
453
454    private object NextValue(Tokenizer tokenizer) {
455      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
456      Token current = tokenizer.Next();
457      if (current.type == TokenTypeEnum.Separator) {
458        return double.NaN;
459      } else if (current.type == TokenTypeEnum.String) {
460        return current.stringValue;
461      } else if (current.type == TokenTypeEnum.Double) {
462        return current.doubleValue;
463      } else if (current.type == TokenTypeEnum.DateTime) {
464        return current.dateTimeValue;
465      }
466      // found an unexpected token => throw error
467      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
468      // this line is never executed because Error() throws an exception
469      throw new InvalidOperationException();
470    }
471
472    private void ParseVariableNames() {
473      // the first line must contain variable names
474      List<Token> tokens = new List<Token>();
475      Token valueToken;
476      valueToken = tokenizer.Next();
477      tokens.Add(valueToken);
478      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
479        Expect(tokenizer.SeparatorToken);
480        valueToken = tokenizer.Next();
481        if (valueToken != tokenizer.NewlineToken) {
482          tokens.Add(valueToken);
483        }
484      }
485      if (valueToken != tokenizer.NewlineToken) {
486        Expect(tokenizer.NewlineToken);
487      }
488      variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
489    }
490
491    private void Expect(Token expectedToken) {
492      Token actualToken = tokenizer.Next();
493      if (actualToken != expectedToken) {
494        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
495      }
496    }
497
498    private void Error(string message, string token, int lineNumber) {
499      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
500    }
501    #endregion
502
503    [Serializable]
504    public class DataFormatException : Exception {
505      private int line;
506      public int Line {
507        get { return line; }
508      }
509      private string token;
510      public string Token {
511        get { return token; }
512      }
513      public DataFormatException(string message, string token, int line)
514        : base(message + "\nToken: " + token + " (line: " + line + ")") {
515        this.token = token;
516        this.line = line;
517      }
518
519      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
520    }
521  }
522}
Note: See TracBrowser for help on using the repository browser.