Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13414

Last change on this file since 13414 was 13414, checked in by gkronber, 8 years ago

#2071: added progress reporting when importing regression problem data from csv files.

File size: 22.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30
31namespace HeuristicLab.Problems.Instances.DataAnalysis {
32  public class TableFileParser : Progress<long> { // reports the number of bytes read
33    private const int BUFFER_SIZE = 65536;
34    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
35    private const char WHITESPACECHAR = (char)0;
36    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
37    private Tokenizer tokenizer;
38    private List<List<object>> rowValues;
39
40    private int rows;
41    public int Rows {
42      get { return rows; }
43      set { rows = value; }
44    }
45
46    private int columns;
47    public int Columns {
48      get { return columns; }
49      set { columns = value; }
50    }
51
52    private List<IList> values;
53    public List<IList> Values {
54      get {
55        return values;
56      }
57    }
58
59    private List<string> variableNames;
60    public IEnumerable<string> VariableNames {
61      get {
62        if (variableNames.Count > 0) return variableNames;
63        else {
64          string[] names = new string[columns];
65          for (int i = 0; i < names.Length; i++) {
66            names[i] = "X" + i.ToString("000");
67          }
68          return names;
69        }
70      }
71    }
72
73    public TableFileParser() {
74      rowValues = new List<List<object>>();
75      variableNames = new List<string>();
76    }
77
78    public bool AreColumnNamesInFirstLine(string fileName) {
79      NumberFormatInfo numberFormat;
80      DateTimeFormatInfo dateTimeFormatInfo;
81      char separator;
82      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
83      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
84        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
85      }
86    }
87
88    public bool AreColumnNamesInFirstLine(Stream stream) {
89      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
90      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
91      char separator = ',';
92      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
93    }
94
95    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
96                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
97      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
98        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
99      }
100    }
101
102    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
103                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
104      using (StreamReader reader = new StreamReader(stream)) {
105        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
106        return tokenizer.PeekType() != TokenTypeEnum.Double;
107      }
108    }
109
110    /// <summary>
111    /// Parses a file and determines the format first
112    /// </summary>
113    /// <param name="fileName">file which is parsed</param>
114    /// <param name="columnNamesInFirstLine"></param>
115    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
116      NumberFormatInfo numberFormat;
117      DateTimeFormatInfo dateTimeFormatInfo;
118      char separator;
119      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
120      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
121    }
122
123    /// <summary>
124    /// Parses a file with the given formats
125    /// </summary>
126    /// <param name="fileName">file which is parsed</param>
127    /// <param name="numberFormat">Format of numbers</param>
128    /// <param name="dateTimeFormatInfo">Format of datetime</param>
129    /// <param name="separator">defines the separator</param>
130    /// <param name="columnNamesInFirstLine"></param>
131    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
132      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
133        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
134      }
135    }
136
137    /// <summary>
138    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
139    /// </summary>
140    /// <param name="stream">stream which is parsed</param>
141    /// <param name="columnNamesInFirstLine"></param>
142    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
143      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
144      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
145      char separator = ',';
146      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
147    }
148
149    /// <summary>
150    /// Parses a stream with the given formats.
151    /// </summary>
152    /// <param name="stream">Stream which is parsed</param>   
153    /// <param name="numberFormat">Format of numbers</param>
154    /// <param name="dateTimeFormatInfo">Format of datetime</param>
155    /// <param name="separator">defines the separator</param>
156    /// <param name="columnNamesInFirstLine"></param>
157    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
158      using (StreamReader reader = new StreamReader(stream)) {
159        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
160        // parse the file
161        Parse(columnNamesInFirstLine, lineLimit);
162      }
163
164      // translate the list of samples into a DoubleMatrixData item
165      rows = rowValues.Count;
166      columns = rowValues[0].Count;
167      values = new List<IList>();
168
169      //create columns
170      for (int col = 0; col < columns; col++) {
171        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
172        if (!types.Any()) {
173          values.Add(new List<string>());
174          continue;
175        }
176
177        var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
178        if (columnType == typeof(double)) values.Add(new List<double>());
179        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
180        else if (columnType == typeof(string)) values.Add(new List<string>());
181        else throw new InvalidOperationException();
182      }
183
184
185
186      //fill with values
187      foreach (List<object> row in rowValues) {
188        int columnIndex = 0;
189        foreach (object element in row) {
190          if (values[columnIndex] is List<double> && !(element is double))
191            values[columnIndex].Add(double.NaN);
192          else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
193            values[columnIndex].Add(DateTime.MinValue);
194          else if (values[columnIndex] is List<string> && !(element is string))
195            values[columnIndex].Add(element.ToString());
196          else
197            values[columnIndex].Add(element);
198          columnIndex++;
199        }
200      }
201    }
202
203    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
204      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
205    }
206
207    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
208      using (StreamReader reader = new StreamReader(stream)) {
209        // skip first line
210        reader.ReadLine();
211        // read a block
212        char[] buffer = new char[BUFFER_SIZE];
213        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
214        // count frequency of special characters
215        Dictionary<char, int> charCounts = buffer.Take(charsRead)
216          .GroupBy(c => c)
217          .ToDictionary(g => g.Key, g => g.Count());
218
219        // depending on the characters occuring in the block
220        // we distinghish a number of different cases based on the the following rules:
221        // many points => it must be English number format, the other frequently occuring char is the separator
222        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
223        //   => check the line in more detail:
224        //            English: 0, 0, 0, 0
225        //            German:  0,0 0,0 0,0 ...
226        //            => if commas are followed by space => English format
227        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
228        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
229        if (OccurrencesOf(charCounts, '.') > 10) {
230          numberFormat = NumberFormatInfo.InvariantInfo;
231          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
232          separator = POSSIBLE_SEPARATORS
233            .Where(c => OccurrencesOf(charCounts, c) > 10)
234            .OrderBy(c => -OccurrencesOf(charCounts, c))
235            .DefaultIfEmpty(' ')
236            .First();
237        } else if (OccurrencesOf(charCounts, ',') > 10) {
238          // no points and many commas
239          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
240          int tokensWithMultipleCommas = 0;
241          for (int i = 0; i < charsRead; i++) {
242            int nCommas = 0;
243            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
244              if (buffer[i] == ',') nCommas++;
245              i++;
246            }
247            if (nCommas > 2) tokensWithMultipleCommas++;
248          }
249          if (tokensWithMultipleCommas > 1) {
250            // English format (only integer values) with ',' as separator
251            numberFormat = NumberFormatInfo.InvariantInfo;
252            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
253            separator = ',';
254          } else {
255            char[] disallowedSeparators = new char[] { ',' };
256            // German format (real values)
257            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
258            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
259            separator = POSSIBLE_SEPARATORS
260              .Except(disallowedSeparators)
261              .Where(c => OccurrencesOf(charCounts, c) > 10)
262              .OrderBy(c => -OccurrencesOf(charCounts, c))
263              .DefaultIfEmpty(' ')
264              .First();
265          }
266        } else {
267          // no points and no commas => English format
268          numberFormat = NumberFormatInfo.InvariantInfo;
269          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
270          separator = POSSIBLE_SEPARATORS
271            .Where(c => OccurrencesOf(charCounts, c) > 10)
272            .OrderBy(c => -OccurrencesOf(charCounts, c))
273            .DefaultIfEmpty(' ')
274            .First();
275        }
276      }
277    }
278
279    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
280      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
281    }
282
283    #region tokenizer
284    internal enum TokenTypeEnum {
285      NewLine, Separator, String, Double, DateTime
286    }
287
288    internal class Tokenizer {
289      private StreamReader reader;
290      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
291      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
292      private string[] stringVals = new string[1024];
293      private double[] doubleVals = new double[1024];
294      private DateTime[] dateTimeVals = new DateTime[1024];
295      private int tokenPos;
296      private int numTokens;
297      private NumberFormatInfo numberFormatInfo;
298      private DateTimeFormatInfo dateTimeFormatInfo;
299      private char separator;
300      private const string INTERNAL_SEPARATOR = "#";
301
302      private int currentLineNumber = 0;
303      public int CurrentLineNumber {
304        get { return currentLineNumber; }
305        private set { currentLineNumber = value; }
306      }
307      private string currentLine;
308      public string CurrentLine {
309        get { return currentLine; }
310        private set { currentLine = value; }
311      }
312      public long BytesRead {
313        get;
314        private set;
315      }
316
317
318      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
319        this.reader = reader;
320        this.numberFormatInfo = numberFormatInfo;
321        this.dateTimeFormatInfo = dateTimeFormatInfo;
322        this.separator = separator;
323        ReadNextTokens();
324      }
325
326      private void ReadNextTokens() {
327        if (!reader.EndOfStream) {
328          CurrentLine = reader.ReadLine();
329          try {
330            BytesRead = reader.BaseStream.Position;
331          } catch (IOException) {
332            BytesRead += CurrentLine.Length + 2; // guess
333          } catch (NotSupportedException) {
334            BytesRead += CurrentLine.Length + 2;
335          }
336          int i = 0;
337          foreach (var tok in Split(CurrentLine)) {
338            var trimmedStr = tok.Trim();
339            if (!string.IsNullOrEmpty(trimmedStr)) {
340              TokenTypeEnum type = TokenTypeEnum.String; // default
341              stringVals[i] = trimmedStr;
342              double doubleVal;
343              DateTime dateTimeValue;
344              if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
345                type = TokenTypeEnum.Separator;
346              } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
347                type = TokenTypeEnum.Double;
348                doubleVals[i] = doubleVal;
349              } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
350                type = TokenTypeEnum.DateTime;
351                dateTimeVals[i] = dateTimeValue;
352              }
353
354              // couldn't parse the token as an int or float number  or datetime value so return a string token
355
356              tokenTypes[i] = type;
357              i++;
358
359              if (i >= tokenTypes.Length) {
360                // increase buffer size if necessary
361                IncreaseCapacity(ref tokenTypes);
362                IncreaseCapacity(ref doubleVals);
363                IncreaseCapacity(ref stringVals);
364                IncreaseCapacity(ref dateTimeVals);
365              }
366            }
367          }
368          tokenTypes[i] = TokenTypeEnum.NewLine;
369          numTokens = i + 1;
370          tokenPos = 0;
371        }
372      }
373
374      private static void IncreaseCapacity<T>(ref T[] arr) {
375        int n = (int)Math.Floor(arr.Length * 1.7); // guess
376        T[] arr2 = new T[n];
377        Array.Copy(arr, arr2, arr.Length);
378        arr = arr2;
379      }
380
381      private IEnumerable<string> Split(string line) {
382        string[] splitString;
383        if (separator == WHITESPACECHAR) {
384          //separate whitespaces
385          splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
386        } else {
387          splitString = line.Split(separator);
388        }
389
390        for (int i = 0; i < splitString.Length - 1; i++) {
391          yield return splitString[i];
392          yield return INTERNAL_SEPARATOR;
393        }
394        // do not return the INTERNAL_SEPARATOR after the last string
395        yield return splitString[splitString.Length - 1];
396      }
397
398      public TokenTypeEnum PeekType() {
399        return tokenTypes[tokenPos];
400      }
401
402      public void Skip() {
403        // simply skips one token without returning the result values
404        tokenPos++;
405        if (numTokens == tokenPos) {
406          ReadNextTokens();
407        }
408      }
409
410      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
411        type = tokenTypes[tokenPos];
412        strVal = stringVals[tokenPos];
413        dblVal = doubleVals[tokenPos];
414        dateTimeVal = dateTimeVals[tokenPos];
415
416        Skip();
417      }
418
419      public bool HasNext() {
420        return numTokens > tokenPos || !reader.EndOfStream;
421      }
422    }
423    #endregion
424
425    #region parsing
426    private void Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
427      if (columnNamesInFirstLine) {
428        ParseVariableNames();
429        if (!tokenizer.HasNext())
430          Error(
431            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
432            "", tokenizer.CurrentLineNumber);
433      }
434      ParseValues(lineLimit);
435      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
436    }
437
438    private void ParseValues(int lineLimit = -1) {
439      int nLinesParsed = 0;
440      while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
441        if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
442          tokenizer.Skip();
443          nLinesParsed++;
444        } else {
445          List<object> row = new List<object>();
446          object value = NextValue(tokenizer);
447          row.Add(value);
448          while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
449            ExpectType(TokenTypeEnum.Separator);
450            row.Add(NextValue(tokenizer));
451          }
452          ExpectType(TokenTypeEnum.NewLine);
453          nLinesParsed++;
454          // all rows have to have the same number of values           
455          // the first row defines how many samples are needed
456          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
457            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
458                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
459                  tokenizer.CurrentLineNumber);
460          }
461          rowValues.Add(row);
462        }
463
464        OnReport(tokenizer.BytesRead);
465      }
466    }
467
468    private object NextValue(Tokenizer tokenizer) {
469      if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
470      TokenTypeEnum type;
471      string strVal;
472      double dblVal;
473      DateTime dateTimeVal;
474
475      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
476      switch (type) {
477        case TokenTypeEnum.Separator: return double.NaN;
478        case TokenTypeEnum.String: return strVal;
479        case TokenTypeEnum.Double: return dblVal;
480        case TokenTypeEnum.DateTime: return dateTimeVal;
481      }
482      // found an unexpected token => throw error
483      Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
484      // this line is never executed because Error() throws an exception
485      throw new InvalidOperationException();
486    }
487
488    private void ParseVariableNames() {
489      // the first line must contain variable names
490      List<string> varNames = new List<string>();
491
492      TokenTypeEnum type;
493      string strVal;
494      double dblVal;
495      DateTime dateTimeVal;
496
497      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
498
499      // the first token must be a variable name
500      if (type != TokenTypeEnum.String)
501        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
502      varNames.Add(strVal);
503
504      while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
505        ExpectType(TokenTypeEnum.Separator);
506        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
507        varNames.Add(strVal);
508      }
509      ExpectType(TokenTypeEnum.NewLine);
510
511      variableNames = varNames;
512    }
513
514    private void ExpectType(TokenTypeEnum expectedToken) {
515      if (tokenizer.PeekType() != expectedToken)
516        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
517      tokenizer.Skip();
518    }
519
520    private void Error(string message, string token, int lineNumber) {
521      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
522    }
523    #endregion
524
525    [Serializable]
526    public class DataFormatException : Exception {
527      private int line;
528      public int Line {
529        get { return line; }
530      }
531      private string token;
532      public string Token {
533        get { return token; }
534      }
535      public DataFormatException(string message, string token, int line)
536        : base(message + "\nToken: " + token + " (line: " + line + ")") {
537        this.token = token;
538        this.line = line;
539      }
540
541      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
542    }
543  }
544}
Note: See TracBrowser for help on using the repository browser.