source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13411

Last change on this file since 13411 was 13411, checked in by gkronber, 6 years ago

#2071 minor refactoring of TableFileParser (let's see if the unit tests pass..., today I'm daring...)

File size: 22.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30using System.Security.Policy;
31
32namespace HeuristicLab.Problems.Instances.DataAnalysis {
33  public class TableFileParser {
34    private const int BUFFER_SIZE = 65536;
35    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
36    private const char WHITESPACECHAR = (char)0;
37    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
38    private Tokenizer tokenizer;
39    private List<List<object>> rowValues;
40
41    private int rows;
42    public int Rows {
43      get { return rows; }
44      set { rows = value; }
45    }
46
47    private int columns;
48    public int Columns {
49      get { return columns; }
50      set { columns = value; }
51    }
52
53    private List<IList> values;
54    public List<IList> Values {
55      get {
56        return values;
57      }
58    }
59
60    private List<string> variableNames;
61    public IEnumerable<string> VariableNames {
62      get {
63        if (variableNames.Count > 0) return variableNames;
64        else {
65          string[] names = new string[columns];
66          for (int i = 0; i < names.Length; i++) {
67            names[i] = "X" + i.ToString("000");
68          }
69          return names;
70        }
71      }
72    }
73
74    public TableFileParser() {
75      rowValues = new List<List<object>>();
76      variableNames = new List<string>();
77    }
78
79    public bool AreColumnNamesInFirstLine(string fileName) {
80      NumberFormatInfo numberFormat;
81      DateTimeFormatInfo dateTimeFormatInfo;
82      char separator;
83      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
84      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
85        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
86      }
87    }
88
89    public bool AreColumnNamesInFirstLine(Stream stream) {
90      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
91      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
92      char separator = ',';
93      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
94    }
95
96    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
97                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
98      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
99        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
100      }
101    }
102
103    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
104                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
105      using (StreamReader reader = new StreamReader(stream)) {
106        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
107        return tokenizer.PeekType() != TokenTypeEnum.Double;
108      }
109    }
110
111    /// <summary>
112    /// Parses a file and determines the format first
113    /// </summary>
114    /// <param name="fileName">file which is parsed</param>
115    /// <param name="columnNamesInFirstLine"></param>
116    public void Parse(string fileName, bool columnNamesInFirstLine) {
117      NumberFormatInfo numberFormat;
118      DateTimeFormatInfo dateTimeFormatInfo;
119      char separator;
120      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
121      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
122    }
123
124    /// <summary>
125    /// Parses a file with the given formats
126    /// </summary>
127    /// <param name="fileName">file which is parsed</param>
128    /// <param name="numberFormat">Format of numbers</param>
129    /// <param name="dateTimeFormatInfo">Format of datetime</param>
130    /// <param name="separator">defines the separator</param>
131    /// <param name="columnNamesInFirstLine"></param>
132    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
133      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
134        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
135      }
136    }
137
138    /// <summary>
139    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
140    /// </summary>
141    /// <param name="stream">stream which is parsed</param>
142    /// <param name="columnNamesInFirstLine"></param>
143    public void Parse(Stream stream, bool columnNamesInFirstLine) {
144      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
145      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
146      char separator = ',';
147      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
148    }
149
150    /// <summary>
151    /// Parses a stream with the given formats.
152    /// </summary>
153    /// <param name="stream">Stream which is parsed</param>   
154    /// <param name="numberFormat">Format of numbers</param>
155    /// <param name="dateTimeFormatInfo">Format of datetime</param>
156    /// <param name="separator">defines the separator</param>
157    /// <param name="columnNamesInFirstLine"></param>
158    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
159      using (StreamReader reader = new StreamReader(stream)) {
160        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
161        // parse the file
162        Parse(columnNamesInFirstLine);
163      }
164
165      // translate the list of samples into a DoubleMatrixData item
166      rows = rowValues.Count;
167      columns = rowValues[0].Count;
168      values = new List<IList>();
169
170      //create columns
171      for (int col = 0; col < columns; col++) {
172        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
173        if (!types.Any()) {
174          values.Add(new List<string>());
175          continue;
176        }
177
178        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
179        if (columnType == typeof(double)) values.Add(new List<double>());
180        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
181        else if (columnType == typeof(string)) values.Add(new List<string>());
182        else throw new InvalidOperationException();
183      }
184
185
186
187      //fill with values
188      foreach (List<object> row in rowValues) {
189        int columnIndex = 0;
190        foreach (object element in row) {
191          if (values[columnIndex] is List<double> && !(element is double))
192            values[columnIndex].Add(double.NaN);
193          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
194            values[columnIndex].Add(DateTime.MinValue);
195          else if (values[columnIndex] is List<string> && !(element is string))
196            values[columnIndex].Add(element.ToString());
197          else
198            values[columnIndex].Add(element);
199          columnIndex++;
200        }
201      }
202    }
203
204    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
205      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
206    }
207
208    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
209      using (StreamReader reader = new StreamReader(stream)) {
210        // skip first line
211        reader.ReadLine();
212        // read a block
213        char[] buffer = new char[BUFFER_SIZE];
214        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
215        // count frequency of special characters
216        Dictionary<char, int> charCounts = buffer.Take(charsRead)
217          .GroupBy(c => c)
218          .ToDictionary(g => g.Key, g => g.Count());
219
220        // depending on the characters occuring in the block
221        // we distinghish a number of different cases based on the the following rules:
222        // many points => it must be English number format, the other frequently occuring char is the separator
223        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
224        //   => check the line in more detail:
225        //            English: 0, 0, 0, 0
226        //            German:  0,0 0,0 0,0 ...
227        //            => if commas are followed by space => English format
228        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
229        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
230        if (OccurrencesOf(charCounts, '.') > 10) {
231          numberFormat = NumberFormatInfo.InvariantInfo;
232          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
233          separator = POSSIBLE_SEPARATORS
234            .Where(c => OccurrencesOf(charCounts, c) > 10)
235            .OrderBy(c => -OccurrencesOf(charCounts, c))
236            .DefaultIfEmpty(' ')
237            .First();
238        } else if (OccurrencesOf(charCounts, ',') > 10) {
239          // no points and many commas
240          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
241          int tokensWithMultipleCommas = 0;
242          for (int i = 0; i < charsRead; i++) {
243            int nCommas = 0;
244            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
245              if (buffer[i] == ',') nCommas++;
246              i++;
247            }
248            if (nCommas > 2) tokensWithMultipleCommas++;
249          }
250          if (tokensWithMultipleCommas > 1) {
251            // English format (only integer values) with ',' as separator
252            numberFormat = NumberFormatInfo.InvariantInfo;
253            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
254            separator = ',';
255          } else {
256            char[] disallowedSeparators = new char[] { ',' };
257            // German format (real values)
258            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
259            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
260            separator = POSSIBLE_SEPARATORS
261              .Except(disallowedSeparators)
262              .Where(c => OccurrencesOf(charCounts, c) > 10)
263              .OrderBy(c => -OccurrencesOf(charCounts, c))
264              .DefaultIfEmpty(' ')
265              .First();
266          }
267        } else {
268          // no points and no commas => English format
269          numberFormat = NumberFormatInfo.InvariantInfo;
270          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
271          separator = POSSIBLE_SEPARATORS
272            .Where(c => OccurrencesOf(charCounts, c) > 10)
273            .OrderBy(c => -OccurrencesOf(charCounts, c))
274            .DefaultIfEmpty(' ')
275            .First();
276        }
277      }
278    }
279
280    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
281      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
282    }
283
284    #region tokenizer
285    internal enum TokenTypeEnum {
286      NewLine, Separator, String, Double, DateTime
287    }
288
289
290    //internal class Token {
291    //  public TokenTypeEnum type;
292    //  public string stringValue;
293    //  public double doubleValue;
294    //  public DateTime dateTimeValue;
295    //
296    //  public Token(TokenTypeEnum type, string value) {
297    //    this.type = type;
298    //    stringValue = value;
299    //    dateTimeValue = DateTime.MinValue;
300    //    doubleValue = 0.0;
301    //  }
302    //
303    //  public bool Equals(Token other) {
304    //    throw new NotImplementedException();
305    //  }
306    //
307    //  public override string ToString() {
308    //    return stringValue;
309    //  }
310    //
311    //  public override bool Equals(object obj) {
312    //    return Equals(obj as Token);
313    //  }
314    //
315    //  public override int GetHashCode() {
316    //    throw new NotSupportedException();
317    //  }
318    //}
319
320
321    internal class Tokenizer {
322      private StreamReader reader;
323      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
324      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
325      private string[] stringVals = new string[1024];
326      private double[] doubleVals = new double[1024];
327      private DateTime[] dateTimeVals = new DateTime[1024];
328      private int tokenPos;
329      private int numTokens;
330      private NumberFormatInfo numberFormatInfo;
331      private DateTimeFormatInfo dateTimeFormatInfo;
332      private char separator;
333      private const string INTERNAL_SEPARATOR = "#";
334
335      private int currentLineNumber = 0;
336      public int CurrentLineNumber {
337        get { return currentLineNumber; }
338        private set { currentLineNumber = value; }
339      }
340      private string currentLine;
341      public string CurrentLine {
342        get { return currentLine; }
343        private set { currentLine = value; }
344      }
345
346      // private Token newlineToken;
347      // public Token NewlineToken {
348      //   get { return newlineToken; }
349      //   private set { newlineToken = value; }
350      // }
351      // private Token separatorToken;
352      // public Token SeparatorToken {
353      //   get { return separatorToken; }
354      //   private set { separatorToken = value; }
355      // }
356
357      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
358        this.reader = reader;
359        this.numberFormatInfo = numberFormatInfo;
360        this.dateTimeFormatInfo = dateTimeFormatInfo;
361        this.separator = separator;
362        //separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
363        //newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
364        ReadNextTokens();
365      }
366
367      private void ReadNextTokens() {
368        if (!reader.EndOfStream) {
369          CurrentLine = reader.ReadLine();
370          int i = 0;
371          foreach (var tok in Split(CurrentLine)) {
372            var trimmedStr = tok.Trim();
373            if (!string.IsNullOrEmpty(trimmedStr)) {
374              TokenTypeEnum type = TokenTypeEnum.String; // default
375              stringVals[i] = trimmedStr;
376              double doubleVal;
377              DateTime dateTimeValue;
378              if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
379                type = TokenTypeEnum.Separator;
380              } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
381                type = TokenTypeEnum.Double;
382                doubleVals[i] = doubleVal;
383              } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
384                type = TokenTypeEnum.DateTime;
385                dateTimeVals[i] = dateTimeValue;
386              }
387
388              // couldn't parse the token as an int or float number  or datetime value so return a string token
389
390              tokenTypes[i] = type;
391              i++;
392
393              if (i >= tokenTypes.Length) {
394                // increase buffer size if necessary
395                IncreaseCapacity(ref tokenTypes);
396                IncreaseCapacity(ref doubleVals);
397                IncreaseCapacity(ref stringVals);
398                IncreaseCapacity(ref dateTimeVals);
399              }
400            }
401          }
402          tokenTypes[i] = TokenTypeEnum.NewLine;
403          numTokens = i + 1;
404          tokenPos = 0;
405        }
406      }
407
408      private static void IncreaseCapacity<T>(ref T[] arr) {
409        int n = (int)Math.Floor(arr.Length * 1.7); // guess
410        T[] arr2 = new T[n];
411        Array.Copy(arr, arr2, arr.Length);
412        arr = arr2;
413      }
414
415      private IEnumerable<string> Split(string line) {
416        string[] splitString;
417        if (separator == WHITESPACECHAR) {
418          //separate whitespaces
419          splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
420        } else {
421          splitString = line.Split(separator);
422        }
423
424        for (int i = 0; i < splitString.Length - 1; i++) {
425          yield return splitString[i];
426          yield return INTERNAL_SEPARATOR;
427        }
428        // do not return the INTERNAL_SEPARATOR after the last string
429        yield return splitString[splitString.Length - 1];
430      }
431
432      public TokenTypeEnum PeekType() {
433        return tokenTypes[tokenPos];
434      }
435
436      public void Skip() {
437        // simply skips one token without returning the result values
438        tokenPos++;
439        if (numTokens == tokenPos) {
440          ReadNextTokens();
441        }
442      }
443
444      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
445        type = tokenTypes[tokenPos];
446        strVal = stringVals[tokenPos];
447        dblVal = doubleVals[tokenPos];
448        dateTimeVal = dateTimeVals[tokenPos];
449
450        Skip();
451      }
452
453      public bool HasNext() {
454        return numTokens > tokenPos || !reader.EndOfStream;
455      }
456    }
457    #endregion
458
459    #region parsing
460    private void Parse(bool columnNamesInFirstLine) {
461      if (columnNamesInFirstLine) {
462        ParseVariableNames();
463        if (!tokenizer.HasNext())
464          Error(
465            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
466            "", tokenizer.CurrentLineNumber);
467      }
468      ParseValues();
469      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
470    }
471
472    private void ParseValues() {
473      while (tokenizer.HasNext()) {
474        if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
475          tokenizer.Skip();
476        } else {
477          List<object> row = new List<object>();
478          object value = NextValue(tokenizer);
479          row.Add(value);
480          while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
481            ExpectType(TokenTypeEnum.Separator);
482            row.Add(NextValue(tokenizer));
483          }
484          ExpectType(TokenTypeEnum.NewLine);
485          // all rows have to have the same number of values           
486          // the first row defines how many samples are needed
487          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
488            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
489                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
490                  tokenizer.CurrentLineNumber);
491          }
492          rowValues.Add(row);
493        }
494      }
495    }
496
497    private object NextValue(Tokenizer tokenizer) {
498      if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
499      TokenTypeEnum type;
500      string strVal;
501      double dblVal;
502      DateTime dateTimeVal;
503
504      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
505      switch (type) {
506        case TokenTypeEnum.Separator: return double.NaN;
507        case TokenTypeEnum.String: return strVal;
508        case TokenTypeEnum.Double: return dblVal;
509        case TokenTypeEnum.DateTime: return dateTimeVal;
510      }
511      // found an unexpected token => throw error
512      Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
513      // this line is never executed because Error() throws an exception
514      throw new InvalidOperationException();
515    }
516
517    private void ParseVariableNames() {
518      // the first line must contain variable names
519      List<string> varNames = new List<string>();
520
521      TokenTypeEnum type;
522      string strVal;
523      double dblVal;
524      DateTime dateTimeVal;
525
526      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
527
528      // the first token must be a variable name
529      if (type != TokenTypeEnum.String)
530        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
531      varNames.Add(strVal);
532
533      while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
534        ExpectType(TokenTypeEnum.Separator);
535        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
536        varNames.Add(strVal);
537      }
538      ExpectType(TokenTypeEnum.NewLine);
539
540      variableNames = varNames;
541    }
542
543    private void ExpectType(TokenTypeEnum expectedToken) {
544      if (tokenizer.PeekType() != expectedToken)
545        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
546      tokenizer.Skip();
547    }
548
549    private void Error(string message, string token, int lineNumber) {
550      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
551    }
552    #endregion
553
554    [Serializable]
555    public class DataFormatException : Exception {
556      private int line;
557      public int Line {
558        get { return line; }
559      }
560      private string token;
561      public string Token {
562        get { return token; }
563      }
564      public DataFormatException(string message, string token, int line)
565        : base(message + "\nToken: " + token + " (line: " + line + ")") {
566        this.token = token;
567        this.line = line;
568      }
569
570      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
571    }
572  }
573}
Note: See TracBrowser for help on using the repository browser.