source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 9608

Last change on this file since 9608 was 9608, checked in by sforsten, 9 years ago

#2070:

  • changed parse methods in TableFileParser to accept a bool which defines, if the first line contains variable names
  • added methods in TableFileParser to check if the first line contains variable names
  • adapted unit tests
  • adapted DataAnalysisImportTypeDialog so that a checkbox can be set to define if the first line contains variable names
  • added the flag NumberStyles.AllowTrailingSign for parsing doubles
File size: 20.7 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30using System.Text;
31
32namespace HeuristicLab.Problems.Instances.DataAnalysis {
33  public class TableFileParser {
// number of characters that are sampled when the file format is determined
private const int BUFFER_SIZE = 64 * 1024;
// characters that are considered as column separators when the file format is determined
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };
// tokenizer over the stream that is parsed
private Tokenizer tokenizer;
// parsed cell values, one inner list per row
private List<List<object>> rowValues;
38
private int rows;
/// <summary>
/// Number of rows. Parse sets it to the number of parsed rows.
/// </summary>
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}
44
private int columns;
/// <summary>
/// Number of columns. Parse sets it to the number of values in the first parsed row.
/// </summary>
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}
50
private List<IList> values;
/// <summary>
/// The parsed data, one list per column. The list is created by Parse and is null before.
/// </summary>
public List<IList> Values {
  get { return values; }
}
57
private List<string> variableNames;
/// <summary>
/// Names of the columns. If no names are available, names of the form X000, X001, ... are
/// generated, one for each column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count <= 0) {
      // no names available => generate one name per column
      string[] generated = new string[columns];
      int index = 0;
      while (index < generated.Length) {
        generated[index] = "X" + index.ToString("000");
        index++;
      }
      return generated;
    }
    return variableNames;
  }
}
71
/// <summary>
/// Creates a parser without any rows and without any variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
76
/// <summary>
/// Determines the format of the file and checks if its first line contains column names.
/// </summary>
/// <param name="fileName">file which is checked</param>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);

  bool namesInFirstLine;
  using (Stream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    namesInFirstLine = AreColumnNamesInFirstLine(input, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
  }
  return namesInFirstLine;
}
86
/// <summary>
/// Checks if the first line of the stream contains column names using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is checked</param>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
93
/// <summary>
/// Opens the file and checks with the given formats if its first line contains column names.
/// </summary>
/// <param name="fileName">file which is checked</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
                                      DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  Stream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    return AreColumnNamesInFirstLine(input, numberFormat, dateTimeFormatInfo, separator);
  } finally {
    input.Dispose();
  }
}
100
/// <summary>
/// Checks with the given formats if the first line of the stream contains column names.
/// The first line is assumed to contain names if its first token is not a number.
/// </summary>
/// <remarks>
/// The stream is closed when the method returns, because the reader that is created for it is disposed.
/// </remarks>
/// <param name="stream">stream which is checked</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a number, false if it is a number or if the stream is empty</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
                                      DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // a local tokenizer is used so that the check does not replace the tokenizer field
    // with a tokenizer whose reader is already disposed
    Tokenizer firstLineTokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // an empty stream has no tokens, Peek() would throw
    if (!firstLineTokenizer.HasNext()) return false;
    return firstLineTokenizer.Peek().type != TokenTypeEnum.Double;
  }
}
108
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(string fileName, bool columnNamesInFirstLine) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // the stream is opened in a using block (like in the other overloads) so that it is closed
  // even if an exception is thrown before a reader has been created for it
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
}
121
/// <summary>
/// Opens a file and parses it with the given formats
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  Stream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    Parse(input, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  } finally {
    input.Dispose();
  }
}
135
/// <summary>
/// Parses a stream with the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(Stream stream, bool columnNamesInFirstLine) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine);
}
147
/// <summary>
/// Parses a stream with the given formats and translates the parsed rows into typed columns
/// (available through Values afterwards).
/// </summary>
/// <remarks>
/// The stream is closed after parsing, because the reader that is created for it is disposed.
/// </remarks>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file
    Parse(columnNamesInFirstLine);
  }

  // Parse(bool) throws if no row could be parsed, therefore rowValues[0] exists here
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);

  // create columns
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumn(col));
  }

  // fill with values
  foreach (List<object> row in rowValues) {
    for (int col = 0; col < row.Count; col++) {
      values[col].Add(ConvertToColumnType(values[col], row[col]));
    }
  }
}

// Creates the empty list for a column. The element type is the most frequent type among the
// first 10 non-empty values of the column; a column without such values is a string column.
private IList CreateColumn(int col) {
  // the query is materialized once, because it is needed for the emptiness check and for the grouping
  List<Type> types = rowValues
    .Select(r => r[col])
    .Where(v => v != null && v as string != string.Empty)
    .Take(10)
    .Select(v => v.GetType())
    .ToList();
  if (types.Count == 0) return new List<string>();

  Type columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  if (columnType == typeof(DateTime)) return new List<DateTime>();
  if (columnType == typeof(string)) return new List<string>();
  throw new InvalidOperationException();
}

// Returns the value that is stored in the given column for a parsed element:
// elements of another type become NaN in double columns and DateTime.MinValue in DateTime columns.
private static object ConvertToColumnType(IList column, object element) {
  if (column is List<double>) {
    if (element is double) return element;
    return double.NaN;
  }
  if (column is List<DateTime>) {
    if (element is DateTime) return element;
    return DateTime.MinValue;
  }
  if (column is List<string>) {
    if (element is string) return element;
    // a number or a date in a string column keeps its text (invariant format)
    // instead of being replaced by an empty string
    return Convert.ToString(element, CultureInfo.InvariantCulture);
  }
  return element;
}
201
/// <summary>
/// Opens the file and determines its number format, date/time format and separator.
/// </summary>
/// <param name="path">file which is analyzed</param>
/// <param name="numberFormat">determined format of numbers</param>
/// <param name="dateTimeFormatInfo">determined format of datetime</param>
/// <param name="separator">determined separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // the stream is opened in a using block so that it is closed
  // even if an exception is thrown before a reader has been created for it
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
205
/// <summary>
/// Determines number format, date/time format and separator from the frequencies of
/// special characters in a block of at most BUFFER_SIZE characters after the first line.
/// </summary>
/// <remarks>
/// The stream is closed when the method returns, because the reader that is created for it is disposed.
/// </remarks>
/// <param name="stream">stream which is analyzed</param>
/// <param name="numberFormat">determined format of numbers</param>
/// <param name="dateTimeFormatInfo">determined format of datetime</param>
/// <param name="separator">determined separator</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
    //   => check the chains of digits and commas in more detail:
    //            English: 0,0,0,0
    //            German:  0,0 0,0 0,0 ...
    //            => a chain with two or more commas cannot be a single German number => English format
    // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > 10) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > 10) {
      // no points and many commas
      if (CountTokensWithMultipleCommas(buffer, charsRead) > 1) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values), ',' is the decimal separator and cannot separate columns
        CultureInfo german = new CultureInfo("de-DE");
        numberFormat = NumberFormatInfo.GetInstance(german);
        dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

// Returns the candidate that occurs most often (and more than 10 times), or ' ' if there is no such candidate.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}

// Counts the tokens (chains of only digits and commas) in the first 'length' characters
// of the buffer that contain multiple (two or more) comma characters.
private static int CountTokensWithMultipleCommas(char[] buffer, int length) {
  int tokensWithMultipleCommas = 0;
  for (int i = 0; i < length; i++) {
    int nCommas = 0;
    while (i < length && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
      if (buffer[i] == ',') nCommas++;
      i++;
    }
    // previously only chains with more than two commas were counted,
    // so that data with three integer columns (1,2,3) was mistaken for German format
    if (nCommas > 1) tokensWithMultipleCommas++;
  }
  return tokensWithMultipleCommas;
}
277
// Returns how often the character has been counted, or 0 if the character has not been counted at all.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  if (!charCounts.ContainsKey(c)) {
    return 0;
  }
  return charCounts[c];
}
281
282    #region tokenizer
internal enum TokenTypeEnum {
  NewLine, Separator, String, Double, DateTime
}

// A token of the parsed text. stringValue always contains the text of the token,
// doubleValue and dateTimeValue are only meaningful for tokens of the corresponding type.
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public DateTime dateTimeValue;

  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    stringValue = value;
    dateTimeValue = DateTime.MinValue;
    doubleValue = 0.0;
  }

  public override string ToString() {
    return stringValue;
  }
}


// Splits the text of a reader line by line into tokens. Only the tokens of one line are buffered,
// the next line is read as soon as the last token of the buffered line has been consumed.
internal class Tokenizer {
  private StreamReader reader;
  // a queue is used because tokens are only consumed at the front
  // (removing the first element of a list moves all remaining elements)
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;
  // text of the separator token (used when the token is printed, e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private int currentLineNumber = 0;
  // number of lines that have been read so far, i.e. the number of the buffered line
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  // the one token instance that marks the end of every line
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  // the one token instance that is used for every separator
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;
    CurrentLine = reader.ReadLine();
    foreach (Token token in Tokenize(CurrentLine)) {
      tokens.Enqueue(token);
    }
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Splits a line at the separator characters. The text between two separators is trimmed,
  // empty text does not produce a token. Separator tokens are produced directly from the
  // separator characters, so a value that consists of the text "#" remains a value.
  private IEnumerable<Token> Tokenize(string line) {
    StringBuilder subStr = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        string text = subStr.ToString().Trim();
        if (text.Length > 0) yield return MakeToken(text);
        subStr.Length = 0;
        yield return SeparatorToken;
      } else {
        subStr.Append(c);
      }
    }
    string lastText = subStr.ToString().Trim();
    if (lastText.Length > 0) yield return MakeToken(lastText);
  }

  // Creates a double token if the text can be parsed as number, a datetime token if it
  // can be parsed as date and a string token otherwise.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float | NumberStyles.AllowTrailingSign, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
      return token;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
      return token;
    }

    // couldn't parse the token as an int or float number or datetime value so return a string token
    return token;
  }

  // Returns the next token without consuming it. Must only be called if HasNext() is true,
  // otherwise an InvalidOperationException is thrown.
  public Token Peek() {
    return tokens.Peek();
  }

  // Consumes and returns the next token. Must only be called if HasNext() is true,
  // otherwise an InvalidOperationException is thrown.
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
409    #endregion
410
411    #region parsing
// Parses the variable names (if the first line contains them) and afterwards all values
// from the current tokenizer. Throws a DataFormatException if there are no values.
private void Parse(bool columnNamesInFirstLine) {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  if (columnNamesInFirstLine) {
    ParseVariableNames();
    bool valuesFollow = tokenizer.HasNext();
    if (!valuesFollow) {
      Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    }
  }

  ParseValues();
  bool anyRowParsed = rowValues.Count != 0;
  if (!anyRowParsed) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
423
// Parses all remaining lines into rows of values and appends them to rowValues.
// Lines without tokens are skipped. Throws a DataFormatException if a row has
// a different number of values than the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      tokenizer.Next();
    } else {
      // the line number is stored before the row is consumed, because the tokenizer already
      // reads the next line (and increments its line number) when the newline token is consumed
      int rowLineNumber = tokenizer.CurrentLineNumber;
      List<object> row = new List<object>();
      object value = NextValue(tokenizer);
      row.Add(value);
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        row.Add(NextValue(tokenizer));
      }
      Expect(tokenizer.NewlineToken);
      // all rows have to have the same number of values
      // the first row defines how many samples are needed
      if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
        Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
              "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
              rowLineNumber);
      }
      rowValues.Add(row);
    }
  }
}
448
// Returns the next value of the row: an empty string (without consuming a token) if the value
// is missing, i.e. if a separator or a newline follows directly, otherwise the value of the
// consumed token.
private object NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  bool valueIsMissing = upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken;
  if (valueIsMissing) return string.Empty;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
466
// Parses the first line as separated list of variable names and stores the trimmed names
// in variableNames. A first line without tokens results in an empty list of names.
// Throws a DataFormatException if there is no line at all.
private void ParseVariableNames() {
  // without this check Next() would fail on empty input
  if (!tokenizer.HasNext()) {
    Error("Couldn't parse variable names because the input is empty.", "", tokenizer.CurrentLineNumber);
  }

  // the first line must contain variable names
  List<Token> nameTokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  // the newline token of a line without tokens is not a name
  if (valueToken != tokenizer.NewlineToken) {
    nameTokens.Add(valueToken);
  }
  // stop as soon as the newline token has been consumed, otherwise a second line
  // that starts with a separator would be parsed as a part of the first line
  while (valueToken != tokenizer.NewlineToken && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    if (valueToken != tokenizer.NewlineToken) {
      nameTokens.Add(valueToken);
    }
  }
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
}
485
// Consumes the next token and throws a DataFormatException if it is not the expected token.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) return;
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
492
// Always throws a DataFormatException with the given message, token and line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
496    #endregion
497
/// <summary>
/// Exception that is thrown if the data cannot be parsed. It contains the token and
/// the line number that have been passed by the parser.
/// </summary>
[Serializable]
public class DataFormatException : Exception {
  // names of the entries for the additional fields in the serialization data
  private const string TOKEN_KEY = "DataFormatException.Token";
  private const string LINE_KEY = "DataFormatException.Line";

  private int line;
  public int Line {
    get { return line; }
  }
  private string token;
  public string Token {
    get { return token; }
  }
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.token = token;
    this.line = line;
  }

  /// <summary>
  /// Deserialization constructor, restores also token and line.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
    // the entries are looked up by enumeration, so that data which has been
    // serialized without these entries can still be deserialized
    foreach (SerializationEntry entry in info) {
      if (entry.Name == TOKEN_KEY) token = info.GetString(TOKEN_KEY);
      else if (entry.Name == LINE_KEY) line = info.GetInt32(LINE_KEY);
    }
  }

  /// <summary>
  /// Adds token and line to the serialization data, otherwise both are lost when the exception is serialized.
  /// </summary>
  // Exception.GetObjectData is security critical in .NET 4, the override is marked the same way
  [System.Security.SecurityCritical]
  public override void GetObjectData(SerializationInfo info, StreamingContext context) {
    base.GetObjectData(info, context);
    info.AddValue(TOKEN_KEY, token);
    info.AddValue(LINE_KEY, line);
  }
}
516  }
517}
Note: See TracBrowser for help on using the repository browser.