source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 7849

Last change on this file since 7849 was 7849, checked in by sforsten, 10 years ago

#1784:

  • added project HeuristicLab.Problem.Instances.DataAnalysis and deleted HeuristicLab.Problem.Instances.Classification and HeuristicLab.Problem.Instances.Regression
  • buttons are now big enough for the icons
File size: 16.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
29using System.Runtime.Serialization;
30using System.Text;
31
32namespace HeuristicLab.Problems.Instances.DataAnalysis {
33  public class TableFileParser {
// Number of characters read from the input when the file format is guessed.
private const int BUFFER_SIZE = 1024;
// Separator candidates examined during format detection (a blank is only used as fallback).
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };

// Tokenizer for the input that is currently being parsed.
private Tokenizer tokenizer;
// Parsed data rows in file order; every cell is a boxed double, a DateTime or a string.
private List<List<object>> rowValues;

// Backing fields of the result properties below.
private int rows;
private int columns;
private List<IList> values;

/// <summary>
/// Number of data rows; assigned by Parse from the number of parsed rows.
/// </summary>
public int Rows {
  get { return this.rows; }
  set { this.rows = value; }
}

/// <summary>
/// Number of columns; assigned by Parse from the width of the first data row.
/// </summary>
public int Columns {
  get { return this.columns; }
  set { this.columns = value; }
}

/// <summary>
/// Column-wise data: one list per column, either a List&lt;double&gt;,
/// a List&lt;DateTime&gt; or a List&lt;string&gt;. Remains null until Parse has completed.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
57
// Column names read from the header line; stays empty when the input has no header.
private List<string> variableNames;
/// <summary>
/// Names of the columns. If no header names were parsed, generic names
/// "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count == 0) {
      // no header available => synthesize one name per column
      string[] generatedNames = new string[columns];
      int index = 0;
      while (index < generatedNames.Length) {
        generatedNames[index] = "X" + index.ToString("000");
        index++;
      }
      return generatedNames;
    }
    return variableNames;
  }
}
71
/// <summary>
/// Creates a parser with empty row storage and no variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
76
/// <summary>
/// Parses the table stored in the file <paramref name="fileName"/>.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <param name="numberFormat">Number format used to recognize numeric cells.</param>
/// <param name="dateTimeFormatInfo">Date format used to recognize date cells.</param>
/// <param name="separator">Character that separates the columns.</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  // FileStream(path, FileMode.Open) asks for read/write access and therefore fails on
  // read-only files; the parser only reads, so request read access explicitly.
  // The using block guarantees that the file handle is released even if parsing fails early.
  using (FileStream stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator);
  }
}
80
/// <summary>
/// Parses the table contained in <paramref name="stream"/> and stores the result
/// column-wise in Values (Rows, Columns and VariableNames are updated as well).
/// The stream is closed when parsing has finished.
/// </summary>
/// <param name="stream">Stream to read the table from.</param>
/// <param name="numberFormat">Number format used to recognize numeric cells.</param>
/// <param name="dateTimeFormatInfo">Date format used to recognize date cells.</param>
/// <param name="separator">Character that separates the columns.</param>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  // start from a clean state so that a parser instance can be used for several inputs;
  // new lists are created (instead of Clear()) because VariableNames hands out the list itself
  rowValues = new List<List<object>>();
  variableNames = new List<string>();

  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file
    Parse();
  }

  // translate the list of rows into typed columns
  // (Parse() throws if no row was read, so rowValues[0] exists)
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);

  // create columns
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumn(col));
  }

  // fill with values; cells that do not match the column type are replaced by the
  // "missing" value of the column type (NaN, DateTime.MinValue or the empty string)
  foreach (List<object> row in rowValues) {
    for (int col = 0; col < row.Count; col++) {
      IList column = values[col];
      object element = row[col];
      if (column is List<double>)
        column.Add(element is double ? element : (object)double.NaN);
      else if (column is List<DateTime>)
        column.Add(element is DateTime ? element : (object)DateTime.MinValue);
      else
        column.Add(element is string ? element : (object)string.Empty);
    }
  }
}

// Creates the (still empty) typed list for column col. The column type is the most frequent
// type among the first ten non-empty cells; a column without any such cell becomes a string column.
private IList CreateColumn(int col) {
  // materialize once, the sample is inspected twice
  List<Type> types = rowValues
    .Select(r => r[col])
    .Where(v => v != null && v as string != string.Empty)
    .Take(10)
    .Select(v => v.GetType())
    .ToList();
  if (types.Count == 0) return new List<string>(rows);

  Type columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>(rows);
  if (columnType == typeof(DateTime)) return new List<DateTime>(rows);
  if (columnType == typeof(string)) return new List<string>(rows);
  throw new InvalidOperationException("Unsupported cell type " + columnType + " in column " + col + ".");
}
126
/// <summary>
/// Guesses number format, date format and column separator of the file <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the detected number format.</param>
/// <param name="dateTimeFormatInfo">Receives the detected date format.</param>
/// <param name="separator">Receives the detected column separator.</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // FileStream(path, FileMode.Open) asks for read/write access and therefore fails on
  // read-only files; the detection only reads, so request read access explicitly.
  using (FileStream stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
130
/// <summary>
/// Guesses number format, date format and column separator from a sample of the stream:
/// the first line is skipped and the following BUFFER_SIZE characters are analyzed.
/// The stream is closed afterwards.
/// </summary>
/// <param name="stream">Stream to inspect.</param>
/// <param name="numberFormat">Receives the detected number format (invariant or de-DE).</param>
/// <param name="dateTimeFormatInfo">Receives the detected date format (invariant or de-DE).</param>
/// <param name="separator">Receives the detected column separator.</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
    //   => check the line in more detail:
    //            English: 0,0,0,0
    //            German:  0,0 0,0 0,0 ...
    //            => a chain of digits and commas with more than one comma cannot be a German number => English format
    // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > 10) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = SelectSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > 10) {
      // no points and many commas
      if (CountTokensWithMultipleCommas(buffer, charsRead) > 1) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values); ',' is the decimal separator and cannot separate columns
        CultureInfo german = new CultureInfo("de-DE");
        char[] disallowedSeparators = new char[] { ',' };
        numberFormat = NumberFormatInfo.GetInstance(german);
        dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
        separator = SelectSeparator(charCounts, POSSIBLE_SEPARATORS.Except(disallowedSeparators));
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = SelectSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

// Returns the most frequent candidate that occurs more than ten times in the sample,
// or ' ' if no candidate is frequent enough.
private static char SelectSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}

// Counts the tokens (chains consisting only of digits and commas) within the first length
// characters of buffer that contain at least two commas.
private static int CountTokensWithMultipleCommas(char[] buffer, int length) {
  int tokensWithMultipleCommas = 0;
  int i = 0;
  while (i < length) {
    int nCommas = 0;
    while (i < length && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
      if (buffer[i] == ',') nCommas++;
      i++;
    }
    // a German real number contains exactly one comma, so already two commas
    // (e.g. "1,2,3") indicate comma-separated integer values
    if (nCommas >= 2) tokensWithMultipleCommas++;
    // step over the character that terminated the chain
    i++;
  }
  return tokensWithMultipleCommas;
}
202
// Returns how often c was counted in charCounts; characters without an entry count as 0.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int occurrences;
  if (charCounts.TryGetValue(c, out occurrences)) {
    return occurrences;
  }
  return 0;
}
206
207    #region tokenizer
// Kinds of tokens produced by the tokenizer.
internal enum TokenTypeEnum {
  // end of an input line
  NewLine,
  // column separator
  Separator,
  // cell that is neither a number nor a date
  String,
  // cell that was parsed as a floating point number
  Double,
  // cell that was parsed as a date
  DateTime
}
211
// A single token of the input. The fields are public (not properties) because the
// tokenizer passes doubleValue and dateTimeValue as out arguments to TryParse.
internal class Token {
  // kind of the token
  public TokenTypeEnum type;
  // text of the token as it appeared in the input
  public string stringValue;
  // numeric value, only meaningful for tokens of type Double
  public double doubleValue;
  // date value, only meaningful for tokens of type DateTime
  public DateTime dateTimeValue;

  // Creates a token of the given type and text; the numeric value starts as 0.0
  // and the date value as DateTime.MinValue.
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
    this.dateTimeValue = DateTime.MinValue;
  }

  // The textual representation of a token is its original text.
  public override string ToString() {
    return this.stringValue;
  }
}
229
230
// Splits the input line by line into tokens (cells, separators and newlines).
// Only the tokens of one line are buffered; the next line is read as soon as the
// last token of the current line has been consumed.
internal class Tokenizer {
  // text of the separator token, only used for display (e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private StreamReader reader;
  // tokens of the current line; entries before 'position' are already consumed
  private List<Token> tokens;
  private int position;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;

  private int currentLineNumber = 0;
  // Number of lines read so far (1-based number of the line in CurrentLine).
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  // Text of the line that was read last.
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  // The one token instance that marks the end of every line (compare by reference).
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  // The one token instance that stands for every column separator (compare by reference).
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  // Creates a tokenizer for reader and immediately tokenizes the first line.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new List<Token>();
    position = 0;
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (!reader.EndOfStream) {
      CurrentLine = reader.ReadLine();
      AppendLineTokens(CurrentLine);
      tokens.Add(NewlineToken);
      CurrentLineNumber++;
    }
  }

  // Splits line at the separator character. Separators are emitted as SeparatorToken
  // directly (not via an intermediate string), so a cell that happens to contain the
  // text "#" is not mistaken for a separator.
  private void AppendLineTokens(string line) {
    StringBuilder cell = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        AppendCellToken(cell.ToString());
        cell.Length = 0;
        tokens.Add(SeparatorToken);
      } else {
        cell.Append(c);
      }
    }
    AppendCellToken(cell.ToString());
  }

  // Appends a token for the trimmed cell text; empty cells do not produce a token.
  private void AppendCellToken(string text) {
    string trimmed = text.Trim();
    if (trimmed.Length > 0) tokens.Add(MakeToken(trimmed));
  }

  // Classifies a cell: number first, then date, otherwise plain string.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
    }
    // neither a number nor a date => the token stays a string token
    return token;
  }

  // Returns the next token without consuming it.
  // Throws an ArgumentOutOfRangeException if no token is left.
  public Token Peek() {
    return tokens[position];
  }

  // Returns the next token and consumes it; reads the next line when the current line is used up.
  // Throws an ArgumentOutOfRangeException if no token is left.
  public Token Next() {
    Token next = tokens[position];
    // advancing an index instead of removing the first list element keeps this O(1)
    position++;
    if (position >= tokens.Count) {
      tokens.Clear();
      position = 0;
      ReadNextTokens();
    }
    return next;
  }

  // True while buffered tokens or unread input remain.
  public bool HasNext() {
    return position < tokens.Count || !reader.EndOfStream;
  }
}
334    #endregion
335
336    #region parsing
// Parses the whole input: an optional header line with the variable names followed by
// the data rows. Raises a parse error if no data follows the header or no row was read.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  bool inputExhausted = !tokenizer.HasNext();
  if (inputExhausted) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  bool noRowParsed = rowValues.Count == 0;
  if (noRowParsed) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
343
// Reads all remaining lines as data rows and appends them to rowValues.
// Empty lines are skipped. Raises a parse error if a row does not have the same
// number of values as the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      // empty line
      tokenizer.Next();
      continue;
    }

    // remember the line of this row now: consuming the newline token below makes the
    // tokenizer read ahead, after which CurrentLineNumber already refers to the next line
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
            rowLineNumber);
    }
    rowValues.Add(row);
  }
}
368
// Returns the value of the next cell as boxed double, DateTime or string.
// An empty cell (the next token is a separator or a newline) yields the empty string
// and consumes nothing; otherwise exactly one token is consumed.
private object NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  bool cellIsEmpty = upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken;
  if (cellIsEmpty) return string.Empty;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
386
// Reads the header line with the variable names, if there is one.
// The first line is regarded as data (and left untouched) if its first token is a number.
// Raises a parse error if the input is empty.
private void ParseVariableNames() {
  // without this check Peek() fails with an ArgumentOutOfRangeException on empty input
  if (!tokenizer.HasNext()) {
    Error("The input does not contain any data.", "", tokenizer.CurrentLineNumber);
  }

  // if first token is double no variables names are given
  if (tokenizer.Peek().type == TokenTypeEnum.Double) return;

  // the first line must contain variable names
  List<Token> nameTokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  nameTokens.Add(valueToken);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    // a trailing separator is directly followed by the newline token, which is not a name
    if (valueToken != tokenizer.NewlineToken) {
      nameTokens.Add(valueToken);
    }
  }
  // consume the end of the header line unless it has already been consumed above
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
}
408
// Consumes the next token and raises a parse error unless it is expectedToken
// (tokens are compared by reference).
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) return;

  string message = "Expected: " + expectedToken;
  Error(message, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
415
// Reports a parse error: always throws a DataFormatException that carries the
// message, the offending token and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
419    #endregion
420
// Exception thrown for all parse errors; carries the offending token and the line number.
[Serializable]
private class DataFormatException : Exception {
  // names under which the additional state is stored during serialization
  private const string TokenEntry = "Token";
  private const string LineEntry = "Line";

  private int line;
  // Line number that was reported together with the error.
  public int Line {
    get { return line; }
  }
  private string token;
  // Text of the token that caused the error (may be empty).
  public string Token {
    get { return token; }
  }

  // Creates the exception; token and line are appended to the message.
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.token = token;
    this.line = line;
  }

  // Deserialization constructor: restores Token and Line in addition to the base state.
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
    this.token = info.GetString(TokenEntry);
    this.line = info.GetInt32(LineEntry);
  }

  // Stores Token and Line in addition to the base state; without this override the
  // two values are lost whenever the exception is serialized.
  public override void GetObjectData(SerializationInfo info, StreamingContext context) {
    base.GetObjectData(info, context);
    info.AddValue(TokenEntry, token);
    info.AddValue(LineEntry, line);
  }
}
439  }
440}
Note: See TracBrowser for help on using the repository browser.