Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HiveHiveEngine/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 12467

Last change on this file since 12467 was 7259, checked in by swagner, 13 years ago

Updated year of copyrights to 2012 (#1716)

File size: 16.4 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections;
24using System.Collections.Generic;
25using System.Globalization;
26using System.IO;
27using System.Linq;
28using System.Runtime.Serialization;
29using System.Text;
30
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Parses a delimited text file (table of values) into typed columns.
  /// The number format (English or German), the date format and the column separator
  /// are guessed from a block of characters following the first line of the file.
  /// After <see cref="Parse(string)"/> the result is available via <see cref="Rows"/>,
  /// <see cref="Columns"/>, <see cref="Values"/> and <see cref="VariableNames"/>.
  /// </summary>
  public class TableFileParser {
    // number of characters inspected to guess the file format
    private const int BUFFER_SIZE = 1024;
    // a character has to occur more often than this in the inspected block to be considered "frequent"
    private const int FREQUENT_CHAR_THRESHOLD = 10;
    // separator candidates; ' ' is only used as a fallback when none of these is frequent
    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
    private Tokenizer tokenizer;
    // raw parsed rows (each element is a double, a DateTime or a string)
    private readonly List<List<object>> rowValues;

    private int rows;
    /// <summary>Number of data rows of the most recently parsed file.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of columns of the most recently parsed file.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private List<IList> values;
    /// <summary>
    /// One list per column; each list is a <c>List&lt;double&gt;</c>, <c>List&lt;DateTime&gt;</c>
    /// or <c>List&lt;string&gt;</c>. Null until a file has been parsed successfully.
    /// </summary>
    public List<IList> Values {
      get {
        return values;
      }
    }

    private List<string> variableNames;
    /// <summary>
    /// The variable names read from the first line of the file. If the file did not contain
    /// names, generated names "X000", "X001", ... (one per column) are returned instead.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;
        else {
          string[] names = new string[columns];
          for (int i = 0; i < names.Length; i++) {
            names[i] = "X" + i.ToString("000");
          }
          return names;
        }
      }
    }

    public TableFileParser() {
      rowValues = new List<List<object>>();
      variableNames = new List<string>();
    }

    /// <summary>
    /// Parses the given file and populates <see cref="Rows"/>, <see cref="Columns"/>,
    /// <see cref="Values"/> and <see cref="VariableNames"/>.
    /// </summary>
    /// <param name="fileName">Path of the file to parse.</param>
    /// <exception cref="Exception">
    /// A parser error (message starts with "Error while parsing.") is thrown when the file is empty,
    /// contains no data rows, or contains rows with differing numbers of columns.
    /// </exception>
    public void Parse(string fileName) {
      // reset the state of a previous run so that a parser instance can be reused;
      // otherwise rows of the previous file would be mixed with the rows of this file
      rowValues.Clear();
      variableNames = new List<string>();
      values = null;
      rows = 0;
      columns = 0;

      NumberFormatInfo numberFormat;
      DateTimeFormatInfo dateTimeFormatInfo;
      char separator;
      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
      using (StreamReader reader = new StreamReader(fileName)) {
        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
        // parse the file
        Parse();
      }

      // translate the list of rows into typed columns
      // (Parse() guarantees at least one row and the same number of elements in each row)
      rows = rowValues.Count;
      columns = rowValues[0].Count;
      values = new List<IList>(columns);

      // create columns
      for (int col = 0; col < columns; col++) {
        values.Add(CreateColumn(col));
      }

      // fill with values
      foreach (List<object> row in rowValues) {
        for (int col = 0; col < row.Count; col++) {
          values[col].Add(ConvertToColumnType(values[col], row[col]));
        }
      }
    }

    /// <summary>
    /// Creates the (empty) typed list for the given column. The type is the most frequent type
    /// among the first ten non-empty values of the column; a column without any non-empty value
    /// becomes a string column.
    /// </summary>
    private IList CreateColumn(int col) {
      // materialize once, the sample is inspected more than once
      List<Type> types = rowValues
        .Select(r => r[col])
        .Where(v => v != null && v as string != string.Empty)
        .Take(10)
        .Select(v => v.GetType())
        .ToList();
      if (types.Count == 0) return new List<string>();

      Type columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
      if (columnType == typeof(double)) return new List<double>();
      if (columnType == typeof(DateTime)) return new List<DateTime>();
      if (columnType == typeof(string)) return new List<string>();
      throw new InvalidOperationException();
    }

    /// <summary>
    /// Returns the element itself if it matches the element type of the column, otherwise the
    /// placeholder for that column type (NaN, DateTime.MinValue or the empty string).
    /// </summary>
    private static object ConvertToColumnType(IList column, object element) {
      if (column is List<double> && !(element is double)) return double.NaN;
      if (column is List<DateTime> && !(element is DateTime)) return DateTime.MinValue;
      if (column is List<string> && !(element is string)) return string.Empty;
      return element;
    }

    /// <summary>
    /// Guesses number format, date format and column separator from the characters that follow
    /// the first line of the file (at most <c>BUFFER_SIZE</c> characters are inspected).
    /// </summary>
    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
      char[] buffer = new char[BUFFER_SIZE];
      int charsRead;
      using (StreamReader reader = new StreamReader(fileName)) {
        // skip first line (it might contain variable names)
        reader.ReadLine();
        // read a block
        charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
      }
      // count frequency of special characters
      Dictionary<char, int> charCounts = buffer.Take(charsRead)
        .GroupBy(c => c)
        .ToDictionary(g => g.Key, g => g.Count());

      // depending on the characters occurring in the block
      // we distinguish a number of different cases based on the following rules:
      // many points => it must be English number format, the other frequently occurring char is the separator
      // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
      //   => check the block in more detail:
      //            English: 0,0,0,0
      //            German:  0,0 0,0 0,0 ...
      //            => if chains of digits and commas contain more than one comma => English format
      // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
      // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
      bool manyPoints = OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD;
      bool manyCommas = OccurrencesOf(charCounts, ',') > FREQUENT_CHAR_THRESHOLD;

      // English format is the result of all cases except one
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;

      if (!manyPoints && manyCommas) {
        if (CountTokensWithMultipleCommas(buffer, charsRead) > 1) {
          // English format (only integer values) with ',' as separator
          separator = ',';
        } else {
          // German format (real values), ',' is the decimal separator and cannot separate columns
          CultureInfo german = CultureInfo.GetCultureInfo("de-DE");
          numberFormat = NumberFormatInfo.GetInstance(german);
          dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
        }
      } else {
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      }
    }

    /// <summary>
    /// Counts the tokens (chains of only digits and commas) in the first <paramref name="length"/>
    /// characters of the buffer that contain more than one comma. A German decimal number contains
    /// at most one comma, so such a token must consist of several comma-separated values.
    /// </summary>
    private static int CountTokensWithMultipleCommas(char[] buffer, int length) {
      int tokensWithMultipleCommas = 0;
      for (int i = 0; i < length; i++) {
        int nCommas = 0;
        while (i < length && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
          if (buffer[i] == ',') nCommas++;
          i++;
        }
        if (nCommas > 1) tokensWithMultipleCommas++;
      }
      return tokensWithMultipleCommas;
    }

    /// <summary>
    /// Returns the candidate that occurs most often in the inspected block, considering only
    /// frequently occurring candidates; returns ' ' if no candidate is frequent.
    /// </summary>
    private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
      return candidates
        .Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
        .OrderBy(c => -OccurrencesOf(charCounts, c))
        .DefaultIfEmpty(' ')
        .First();
    }

    /// <summary>Returns how often <paramref name="c"/> was counted (0 if it was not counted at all).</summary>
    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
      int count;
      return charCounts.TryGetValue(c, out count) ? count : 0;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      NewLine, Separator, String, Double, DateTime
    }

    /// <summary>A token of the input together with its parsed value (depending on its type).</summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public DateTime dateTimeValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        dateTimeValue = DateTime.MinValue;
        doubleValue = 0.0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the lines of a reader into tokens. Each line is terminated by the shared
    /// <see cref="NewlineToken"/>; each occurrence of the separator character is represented by the
    /// shared <see cref="SeparatorToken"/>. Empty fields do not produce a token.
    /// </summary>
    internal class Tokenizer {
      private StreamReader reader;
      // a queue makes removing the first token O(1)
      private Queue<Token> tokens;
      private NumberFormatInfo numberFormatInfo;
      private DateTimeFormatInfo dateTimeFormatInfo;
      private char separator;
      // text of the separator token (e.g. shown in error messages), independent of the actual separator character
      private const string INTERNAL_SEPARATOR = "#";

      private int currentLineNumber = 0;
      public int CurrentLineNumber {
        get { return currentLineNumber; }
        private set { currentLineNumber = value; }
      }
      private string currentLine;
      public string CurrentLine {
        get { return currentLine; }
        private set { currentLine = value; }
      }

      private Token newlineToken;
      public Token NewlineToken {
        get { return newlineToken; }
        private set { newlineToken = value; }
      }
      private Token separatorToken;
      public Token SeparatorToken {
        get { return separatorToken; }
        private set { separatorToken = value; }
      }

      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        this.dateTimeFormatInfo = dateTimeFormatInfo;
        this.separator = separator;
        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      // reads the next line (if there is one) and appends its tokens followed by the newline token
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        foreach (Token token in Tokenize(CurrentLine)) {
          tokens.Enqueue(token);
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      // Produces the tokens of a single line. Separators are emitted directly as SeparatorToken
      // (instead of via a marker string), therefore a field with the text "#" remains a value.
      private IEnumerable<Token> Tokenize(string line) {
        int fieldStart = 0;
        for (int i = 0; i <= line.Length; i++) {
          bool endOfLine = i == line.Length;
          if (!endOfLine && line[i] != separator) continue;

          string field = line.Substring(fieldStart, i - fieldStart).Trim();
          // empty fields are skipped; they are recognized by the parser as two adjacent separators
          if (field.Length > 0) yield return MakeToken(field);
          if (!endOfLine) yield return SeparatorToken;
          fieldStart = i + 1;
        }
      }

      // creates a double, date-time or string token (tried in this order) for the text of a field
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);
        if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
          token.type = TokenTypeEnum.DateTime;
        }
        // if the text is neither a number nor a date-time value the token remains a string token
        return token;
      }

      /// <summary>Returns the next token without consuming it. Must only be called if <see cref="HasNext"/> is true.</summary>
      public Token Peek() {
        return tokens.Peek();
      }

      /// <summary>Consumes and returns the next token. Must only be called if <see cref="HasNext"/> is true.</summary>
      public Token Next() {
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse() {
      // an empty file produces no tokens at all; report this as a parser error instead of failing in Peek()
      if (!tokenizer.HasNext()) Error("The file does not contain any data.", "", tokenizer.CurrentLineNumber);
      ParseVariableNames();
      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
      ParseValues();
      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    }

    // reads all remaining lines as rows of values; empty lines are skipped
    private void ParseValues() {
      while (tokenizer.HasNext()) {
        if (tokenizer.Peek() == tokenizer.NewlineToken) {
          tokenizer.Next();
        } else {
          List<object> row = new List<object>();
          row.Add(NextValue(tokenizer));
          while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
            Expect(tokenizer.SeparatorToken);
            row.Add(NextValue(tokenizer));
          }
          Expect(tokenizer.NewlineToken);
          // all rows have to have the same number of values
          // the first row defines how many values are needed
          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
                  tokenizer.CurrentLineNumber);
          }
          rowValues.Add(row);
        }
      }
    }

    // returns the value of the next field; an empty field (directly followed by a separator
    // or the end of the line) yields the empty string and does not consume a token
    private object NextValue(Tokenizer tokenizer) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return string.Empty;

      Token current = tokenizer.Next();
      switch (current.type) {
        case TokenTypeEnum.String: return current.stringValue;
        case TokenTypeEnum.Double: return current.doubleValue;
        case TokenTypeEnum.DateTime: return current.dateTimeValue;
      }
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
    }

    // reads the variable names from the first line, unless the first line starts with a number
    private void ParseVariableNames() {
      // if the first token is a double no variable names are given
      if (tokenizer.Peek().type == TokenTypeEnum.Double) return;

      // the first line must contain variable names
      List<Token> tokens = new List<Token>();
      Token valueToken;
      valueToken = tokenizer.Next();
      tokens.Add(valueToken);
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        valueToken = tokenizer.Next();
        // a separator at the end of the line is directly followed by the newline token
        if (valueToken != tokenizer.NewlineToken) {
          tokens.Add(valueToken);
        }
      }
      if (valueToken != tokenizer.NewlineToken) {
        Expect(tokenizer.NewlineToken);
      }
      variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
    }

    // consumes the next token and raises a parser error if it is not the expected one
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
      }
    }

    // always throws a DataFormatException
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion

    /// <summary>Parser error carrying the offending token and the line number.</summary>
    [Serializable]
    private class DataFormatException : Exception {
      private int line;
      public int Line {
        get { return line; }
      }
      private string token;
      public string Token {
        get { return token; }
      }
      public DataFormatException(string message, string token, int line)
        : base(message + "\nToken: " + token + " (line: " + line + ")") {
        this.token = token;
        this.line = line;
      }

      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.