source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13413

Last change on this file since 13413 was 13413, checked in by gkronber, 5 years ago

#2071: only preview first 500 lines of data in CSV import dialog.

File size: 21.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30
31namespace HeuristicLab.Problems.Instances.DataAnalysis {
32  public class TableFileParser {
33    private const int BUFFER_SIZE = 65536;
34    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
35    private const char WHITESPACECHAR = (char)0;
36    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
37    private Tokenizer tokenizer;
38    private List<List<object>> rowValues;
39
40    private int rows;
41    public int Rows {
42      get { return rows; }
43      set { rows = value; }
44    }
45
46    private int columns;
47    public int Columns {
48      get { return columns; }
49      set { columns = value; }
50    }
51
52    private List<IList> values;
53    public List<IList> Values {
54      get {
55        return values;
56      }
57    }
58
59    private List<string> variableNames;
60    public IEnumerable<string> VariableNames {
61      get {
62        if (variableNames.Count > 0) return variableNames;
63        else {
64          string[] names = new string[columns];
65          for (int i = 0; i < names.Length; i++) {
66            names[i] = "X" + i.ToString("000");
67          }
68          return names;
69        }
70      }
71    }
72
73    public TableFileParser() {
74      rowValues = new List<List<object>>();
75      variableNames = new List<string>();
76    }
77
78    public bool AreColumnNamesInFirstLine(string fileName) {
79      NumberFormatInfo numberFormat;
80      DateTimeFormatInfo dateTimeFormatInfo;
81      char separator;
82      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
83      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
84        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
85      }
86    }
87
88    public bool AreColumnNamesInFirstLine(Stream stream) {
89      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
90      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
91      char separator = ',';
92      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
93    }
94
95    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
96                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
97      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
98        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
99      }
100    }
101
102    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
103                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
104      using (StreamReader reader = new StreamReader(stream)) {
105        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
106        return tokenizer.PeekType() != TokenTypeEnum.Double;
107      }
108    }
109
110    /// <summary>
111    /// Parses a file and determines the format first
112    /// </summary>
113    /// <param name="fileName">file which is parsed</param>
114    /// <param name="columnNamesInFirstLine"></param>
115    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
116      NumberFormatInfo numberFormat;
117      DateTimeFormatInfo dateTimeFormatInfo;
118      char separator;
119      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
120      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
121    }
122
123    /// <summary>
124    /// Parses a file with the given formats
125    /// </summary>
126    /// <param name="fileName">file which is parsed</param>
127    /// <param name="numberFormat">Format of numbers</param>
128    /// <param name="dateTimeFormatInfo">Format of datetime</param>
129    /// <param name="separator">defines the separator</param>
130    /// <param name="columnNamesInFirstLine"></param>
131    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
132      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
133        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
134      }
135    }
136
137    /// <summary>
138    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
139    /// </summary>
140    /// <param name="stream">stream which is parsed</param>
141    /// <param name="columnNamesInFirstLine"></param>
142    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
143      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
144      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
145      char separator = ',';
146      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
147    }
148
149    /// <summary>
150    /// Parses a stream with the given formats.
151    /// </summary>
152    /// <param name="stream">Stream which is parsed</param>   
153    /// <param name="numberFormat">Format of numbers</param>
154    /// <param name="dateTimeFormatInfo">Format of datetime</param>
155    /// <param name="separator">defines the separator</param>
156    /// <param name="columnNamesInFirstLine"></param>
157    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
158      using (StreamReader reader = new StreamReader(stream)) {
159        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
160        // parse the file
161        Parse(columnNamesInFirstLine, lineLimit);
162      }
163
164      // translate the list of samples into a DoubleMatrixData item
165      rows = rowValues.Count;
166      columns = rowValues[0].Count;
167      values = new List<IList>();
168
169      //create columns
170      for (int col = 0; col < columns; col++) {
171        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
172        if (!types.Any()) {
173          values.Add(new List<string>());
174          continue;
175        }
176
177        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
178        if (columnType == typeof(double)) values.Add(new List<double>());
179        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
180        else if (columnType == typeof(string)) values.Add(new List<string>());
181        else throw new InvalidOperationException();
182      }
183
184
185
186      //fill with values
187      foreach (List<object> row in rowValues) {
188        int columnIndex = 0;
189        foreach (object element in row) {
190          if (values[columnIndex] is List<double> && !(element is double))
191            values[columnIndex].Add(double.NaN);
192          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
193            values[columnIndex].Add(DateTime.MinValue);
194          else if (values[columnIndex] is List<string> && !(element is string))
195            values[columnIndex].Add(element.ToString());
196          else
197            values[columnIndex].Add(element);
198          columnIndex++;
199        }
200      }
201    }
202
203    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
204      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
205    }
206
207    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
208      using (StreamReader reader = new StreamReader(stream)) {
209        // skip first line
210        reader.ReadLine();
211        // read a block
212        char[] buffer = new char[BUFFER_SIZE];
213        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
214        // count frequency of special characters
215        Dictionary<char, int> charCounts = buffer.Take(charsRead)
216          .GroupBy(c => c)
217          .ToDictionary(g => g.Key, g => g.Count());
218
219        // depending on the characters occuring in the block
220        // we distinghish a number of different cases based on the the following rules:
221        // many points => it must be English number format, the other frequently occuring char is the separator
222        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
223        //   => check the line in more detail:
224        //            English: 0, 0, 0, 0
225        //            German:  0,0 0,0 0,0 ...
226        //            => if commas are followed by space => English format
227        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
228        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
229        if (OccurrencesOf(charCounts, '.') > 10) {
230          numberFormat = NumberFormatInfo.InvariantInfo;
231          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
232          separator = POSSIBLE_SEPARATORS
233            .Where(c => OccurrencesOf(charCounts, c) > 10)
234            .OrderBy(c => -OccurrencesOf(charCounts, c))
235            .DefaultIfEmpty(' ')
236            .First();
237        } else if (OccurrencesOf(charCounts, ',') > 10) {
238          // no points and many commas
239          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
240          int tokensWithMultipleCommas = 0;
241          for (int i = 0; i < charsRead; i++) {
242            int nCommas = 0;
243            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
244              if (buffer[i] == ',') nCommas++;
245              i++;
246            }
247            if (nCommas > 2) tokensWithMultipleCommas++;
248          }
249          if (tokensWithMultipleCommas > 1) {
250            // English format (only integer values) with ',' as separator
251            numberFormat = NumberFormatInfo.InvariantInfo;
252            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
253            separator = ',';
254          } else {
255            char[] disallowedSeparators = new char[] { ',' };
256            // German format (real values)
257            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
258            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
259            separator = POSSIBLE_SEPARATORS
260              .Except(disallowedSeparators)
261              .Where(c => OccurrencesOf(charCounts, c) > 10)
262              .OrderBy(c => -OccurrencesOf(charCounts, c))
263              .DefaultIfEmpty(' ')
264              .First();
265          }
266        } else {
267          // no points and no commas => English format
268          numberFormat = NumberFormatInfo.InvariantInfo;
269          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
270          separator = POSSIBLE_SEPARATORS
271            .Where(c => OccurrencesOf(charCounts, c) > 10)
272            .OrderBy(c => -OccurrencesOf(charCounts, c))
273            .DefaultIfEmpty(' ')
274            .First();
275        }
276      }
277    }
278
279    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
280      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
281    }
282
283    #region tokenizer
284    internal enum TokenTypeEnum {
285      NewLine, Separator, String, Double, DateTime
286    }
287
288    internal class Tokenizer {
289      private StreamReader reader;
290      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
291      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
292      private string[] stringVals = new string[1024];
293      private double[] doubleVals = new double[1024];
294      private DateTime[] dateTimeVals = new DateTime[1024];
295      private int tokenPos;
296      private int numTokens;
297      private NumberFormatInfo numberFormatInfo;
298      private DateTimeFormatInfo dateTimeFormatInfo;
299      private char separator;
300      private const string INTERNAL_SEPARATOR = "#";
301
302      private int currentLineNumber = 0;
303      public int CurrentLineNumber {
304        get { return currentLineNumber; }
305        private set { currentLineNumber = value; }
306      }
307      private string currentLine;
308      public string CurrentLine {
309        get { return currentLine; }
310        private set { currentLine = value; }
311      }
312
313      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
314        this.reader = reader;
315        this.numberFormatInfo = numberFormatInfo;
316        this.dateTimeFormatInfo = dateTimeFormatInfo;
317        this.separator = separator;
318        ReadNextTokens();
319      }
320
321      private void ReadNextTokens() {
322        if (!reader.EndOfStream) {
323          CurrentLine = reader.ReadLine();
324          int i = 0;
325          foreach (var tok in Split(CurrentLine)) {
326            var trimmedStr = tok.Trim();
327            if (!string.IsNullOrEmpty(trimmedStr)) {
328              TokenTypeEnum type = TokenTypeEnum.String; // default
329              stringVals[i] = trimmedStr;
330              double doubleVal;
331              DateTime dateTimeValue;
332              if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
333                type = TokenTypeEnum.Separator;
334              } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
335                type = TokenTypeEnum.Double;
336                doubleVals[i] = doubleVal;
337              } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
338                type = TokenTypeEnum.DateTime;
339                dateTimeVals[i] = dateTimeValue;
340              }
341
342              // couldn't parse the token as an int or float number  or datetime value so return a string token
343
344              tokenTypes[i] = type;
345              i++;
346
347              if (i >= tokenTypes.Length) {
348                // increase buffer size if necessary
349                IncreaseCapacity(ref tokenTypes);
350                IncreaseCapacity(ref doubleVals);
351                IncreaseCapacity(ref stringVals);
352                IncreaseCapacity(ref dateTimeVals);
353              }
354            }
355          }
356          tokenTypes[i] = TokenTypeEnum.NewLine;
357          numTokens = i + 1;
358          tokenPos = 0;
359        }
360      }
361
362      private static void IncreaseCapacity<T>(ref T[] arr) {
363        int n = (int)Math.Floor(arr.Length * 1.7); // guess
364        T[] arr2 = new T[n];
365        Array.Copy(arr, arr2, arr.Length);
366        arr = arr2;
367      }
368
369      private IEnumerable<string> Split(string line) {
370        string[] splitString;
371        if (separator == WHITESPACECHAR) {
372          //separate whitespaces
373          splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
374        } else {
375          splitString = line.Split(separator);
376        }
377
378        for (int i = 0; i < splitString.Length - 1; i++) {
379          yield return splitString[i];
380          yield return INTERNAL_SEPARATOR;
381        }
382        // do not return the INTERNAL_SEPARATOR after the last string
383        yield return splitString[splitString.Length - 1];
384      }
385
386      public TokenTypeEnum PeekType() {
387        return tokenTypes[tokenPos];
388      }
389
390      public void Skip() {
391        // simply skips one token without returning the result values
392        tokenPos++;
393        if (numTokens == tokenPos) {
394          ReadNextTokens();
395        }
396      }
397
398      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
399        type = tokenTypes[tokenPos];
400        strVal = stringVals[tokenPos];
401        dblVal = doubleVals[tokenPos];
402        dateTimeVal = dateTimeVals[tokenPos];
403
404        Skip();
405      }
406
407      public bool HasNext() {
408        return numTokens > tokenPos || !reader.EndOfStream;
409      }
410    }
411    #endregion
412
413    #region parsing
414    private void Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
415      if (columnNamesInFirstLine) {
416        ParseVariableNames();
417        if (!tokenizer.HasNext())
418          Error(
419            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
420            "", tokenizer.CurrentLineNumber);
421      }
422      ParseValues(lineLimit);
423      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
424    }
425
426    private void ParseValues(int lineLimit = -1) {
427      int nLinesParsed = 0;
428      while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
429        if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
430          tokenizer.Skip();
431          nLinesParsed++;
432        } else {
433          List<object> row = new List<object>();
434          object value = NextValue(tokenizer);
435          row.Add(value);
436          while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
437            ExpectType(TokenTypeEnum.Separator);
438            row.Add(NextValue(tokenizer));
439          }
440          ExpectType(TokenTypeEnum.NewLine);
441          nLinesParsed++;
442          // all rows have to have the same number of values           
443          // the first row defines how many samples are needed
444          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
445            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
446                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
447                  tokenizer.CurrentLineNumber);
448          }
449          rowValues.Add(row);
450        }
451      }
452    }
453
454    private object NextValue(Tokenizer tokenizer) {
455      if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
456      TokenTypeEnum type;
457      string strVal;
458      double dblVal;
459      DateTime dateTimeVal;
460
461      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
462      switch (type) {
463        case TokenTypeEnum.Separator: return double.NaN;
464        case TokenTypeEnum.String: return strVal;
465        case TokenTypeEnum.Double: return dblVal;
466        case TokenTypeEnum.DateTime: return dateTimeVal;
467      }
468      // found an unexpected token => throw error
469      Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
470      // this line is never executed because Error() throws an exception
471      throw new InvalidOperationException();
472    }
473
474    private void ParseVariableNames() {
475      // the first line must contain variable names
476      List<string> varNames = new List<string>();
477
478      TokenTypeEnum type;
479      string strVal;
480      double dblVal;
481      DateTime dateTimeVal;
482
483      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
484
485      // the first token must be a variable name
486      if (type != TokenTypeEnum.String)
487        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
488      varNames.Add(strVal);
489
490      while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
491        ExpectType(TokenTypeEnum.Separator);
492        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
493        varNames.Add(strVal);
494      }
495      ExpectType(TokenTypeEnum.NewLine);
496
497      variableNames = varNames;
498    }
499
500    private void ExpectType(TokenTypeEnum expectedToken) {
501      if (tokenizer.PeekType() != expectedToken)
502        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
503      tokenizer.Skip();
504    }
505
506    private void Error(string message, string token, int lineNumber) {
507      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
508    }
509    #endregion
510
511    [Serializable]
512    public class DataFormatException : Exception {
513      private int line;
514      public int Line {
515        get { return line; }
516      }
517      private string token;
518      public string Token {
519        get { return token; }
520      }
521      public DataFormatException(string message, string token, int line)
522        : base(message + "\nToken: " + token + " (line: " + line + ")") {
523        this.token = token;
524        this.line = line;
525      }
526
527      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
528    }
529  }
530}
Note: See TracBrowser for help on using the repository browser.