Free cookie consent management tool by TermsFeed Policy Generator

source: branches/PersistenceSpeedUp/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 9760

Last change on this file since 9760 was 6760, checked in by epitzer, 13 years ago

#1530 integrate changes from trunk

File size: 16.2 KB
RevLine 
[2]1#region License Information
2/* HeuristicLab
[5445]3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
[6760]23using System.Collections;
[2]24using System.Collections.Generic;
25using System.Globalization;
26using System.IO;
[2446]27using System.Linq;
[5484]28using System.Runtime.Serialization;
[2446]29using System.Text;
[2]30
[3373]31namespace HeuristicLab.Problems.DataAnalysis {
[5013]32  public class TableFileParser {
33    private const int BUFFER_SIZE = 1024;
34    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
[2]35    private Tokenizer tokenizer;
[6760]36    private List<List<object>> rowValues;
[2]37
38    private int rows;
39    public int Rows {
40      get { return rows; }
41      set { rows = value; }
42    }
43
44    private int columns;
45    public int Columns {
46      get { return columns; }
47      set { columns = value; }
48    }
49
[6760]50    private List<IList> values;
51    public List<IList> Values {
[2]52      get {
[3264]53        return values;
[2]54      }
55    }
56
[5369]57    private List<string> variableNames;
[3264]58    public IEnumerable<string> VariableNames {
[2]59      get {
[3264]60        if (variableNames.Count > 0) return variableNames;
61        else {
[273]62          string[] names = new string[columns];
[1221]63          for (int i = 0; i < names.Length; i++) {
[273]64            names[i] = "X" + i.ToString("000");
65          }
66          return names;
[2]67        }
68      }
69    }
70
[5013]71    public TableFileParser() {
[6760]72      rowValues = new List<List<object>>();
[3264]73      variableNames = new List<string>();
[2]74    }
75
[3264]76    public void Parse(string fileName) {
[5013]77      NumberFormatInfo numberFormat;
[6760]78      DateTimeFormatInfo dateTimeFormatInfo;
[5013]79      char separator;
[6760]80      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
[5013]81      using (StreamReader reader = new StreamReader(fileName)) {
[6760]82        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[5013]83        // parse the file
84        Parse();
85      }
86
[2]87      // translate the list of samples into a DoubleMatrixData item
[3264]88      rows = rowValues.Count;
89      columns = rowValues[0].Count;
[6760]90      values = new List<IList>();
[2]91
[6760]92      //create columns
93      for (int col = 0; col < columns; col++) {
94        var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
95        if (!types.Any()) {
96          values.Add(new List<string>());
97          continue;
[2]98        }
[6760]99
100        var columnType = types.GroupBy(v => v).OrderBy(v => v).Last().Key;
101        if (columnType == typeof(double)) values.Add(new List<double>());
102        else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
103        else if (columnType == typeof(string)) values.Add(new List<string>());
104        else throw new InvalidOperationException();
[2]105      }
[6760]106
107
108
109      //fill with values
110      foreach (List<object> row in rowValues) {
111        int columnIndex = 0;
112        foreach (object element in row) {
113          //handle missing values with default values
114          if (element as string == string.Empty) {
115            if (values[columnIndex] is List<double>) values[columnIndex].Add(double.NaN);
116            else if (values[columnIndex] is List<DateTime>) values[columnIndex].Add(DateTime.MinValue);
117            else if (values[columnIndex] is List<string>) values[columnIndex].Add(string.Empty);
118            else throw new InvalidOperationException();
119          } else values[columnIndex].Add(element);
120          columnIndex++;
121        }
122      }
[2]123    }
124
[6760]125    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
[5013]126      using (StreamReader reader = new StreamReader(fileName)) {
127        // skip first line
128        reader.ReadLine();
129        // read a block
130        char[] buffer = new char[BUFFER_SIZE];
131        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
132        // count frequency of special characters
133        Dictionary<char, int> charCounts = buffer.Take(charsRead)
134          .GroupBy(c => c)
135          .ToDictionary(g => g.Key, g => g.Count());
136
137        // depending on the characters occuring in the block
138        // we distinghish a number of different cases based on the the following rules:
139        // many points => it must be English number format, the other frequently occuring char is the separator
140        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
141        //   => check the line in more detail:
142        //            English: 0, 0, 0, 0
143        //            German:  0,0 0,0 0,0 ...
144        //            => if commas are followed by space => English format
145        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
146        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
147        if (OccurrencesOf(charCounts, '.') > 10) {
148          numberFormat = NumberFormatInfo.InvariantInfo;
[6760]149          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]150          separator = POSSIBLE_SEPARATORS
151            .Where(c => OccurrencesOf(charCounts, c) > 10)
152            .OrderBy(c => -OccurrencesOf(charCounts, c))
153            .DefaultIfEmpty(' ')
154            .First();
155        } else if (OccurrencesOf(charCounts, ',') > 10) {
156          // no points and many commas
157          int countCommaNonDigitPairs = 0;
158          for (int i = 0; i < charsRead - 1; i++) {
159            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
160              countCommaNonDigitPairs++;
161            }
[1221]162          }
[5013]163          if (countCommaNonDigitPairs > 10) {
164            // English format (only integer values) with ',' as separator
165            numberFormat = NumberFormatInfo.InvariantInfo;
[6760]166            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]167            separator = ',';
168          } else {
169            char[] disallowedSeparators = new char[] { ',' };
170            // German format (real values)
[5096]171            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
[6760]172            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
[5013]173            separator = POSSIBLE_SEPARATORS
174              .Except(disallowedSeparators)
175              .Where(c => OccurrencesOf(charCounts, c) > 10)
176              .OrderBy(c => -OccurrencesOf(charCounts, c))
177              .DefaultIfEmpty(' ')
178              .First();
[405]179          }
[5013]180        } else {
181          // no points and no commas => English format
182          numberFormat = NumberFormatInfo.InvariantInfo;
[6760]183          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]184          separator = POSSIBLE_SEPARATORS
185            .Where(c => OccurrencesOf(charCounts, c) > 10)
186            .OrderBy(c => -OccurrencesOf(charCounts, c))
187            .DefaultIfEmpty(' ')
188            .First();
[405]189        }
190      }
191    }
192
[5013]193    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
194      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
195    }
196
[2]197    #region tokenizer
198    internal enum TokenTypeEnum {
[6760]199      NewLine, Separator, String, Double, DateTime
[2]200    }
201
202    internal class Token {
203      public TokenTypeEnum type;
204      public string stringValue;
205      public double doubleValue;
[6760]206      public DateTime dateTimeValue;
[2]207
208      public Token(TokenTypeEnum type, string value) {
209        this.type = type;
210        stringValue = value;
[6760]211        dateTimeValue = DateTime.MinValue;
[2]212        doubleValue = 0.0;
213      }
214
215      public override string ToString() {
216        return stringValue;
217      }
218    }
219
220
[3264]221    internal class Tokenizer {
[2]222      private StreamReader reader;
223      private List<Token> tokens;
[405]224      private NumberFormatInfo numberFormatInfo;
[6760]225      private DateTimeFormatInfo dateTimeFormatInfo;
[5013]226      private char separator;
227      private const string INTERNAL_SEPARATOR = "#";
[2]228
[3264]229      private int currentLineNumber = 0;
230      public int CurrentLineNumber {
231        get { return currentLineNumber; }
232        private set { currentLineNumber = value; }
233      }
234      private string currentLine;
235      public string CurrentLine {
236        get { return currentLine; }
237        private set { currentLine = value; }
238      }
[2]239
[3264]240      private Token newlineToken;
241      public Token NewlineToken {
242        get { return newlineToken; }
243        private set { newlineToken = value; }
244      }
245      private Token separatorToken;
246      public Token SeparatorToken {
247        get { return separatorToken; }
248        private set { separatorToken = value; }
249      }
[2]250
[6760]251      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[2]252        this.reader = reader;
[405]253        this.numberFormatInfo = numberFormatInfo;
[6760]254        this.dateTimeFormatInfo = dateTimeFormatInfo;
[5013]255        this.separator = separator;
256        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
[3264]257        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]258        tokens = new List<Token>();
259        ReadNextTokens();
260      }
261
262      private void ReadNextTokens() {
[1221]263        if (!reader.EndOfStream) {
[2]264          CurrentLine = reader.ReadLine();
[2446]265          var newTokens = from str in Split(CurrentLine)
266                          let trimmedStr = str.Trim()
267                          where !string.IsNullOrEmpty(trimmedStr)
[5013]268                          select MakeToken(trimmedStr);
[2]269
[2446]270          tokens.AddRange(newTokens);
[2]271          tokens.Add(NewlineToken);
272          CurrentLineNumber++;
273        }
274      }
275
[2446]276      private IEnumerable<string> Split(string line) {
277        StringBuilder subStr = new StringBuilder();
278        foreach (char c in line) {
[5013]279          if (c == separator) {
[2446]280            yield return subStr.ToString();
281            subStr = new StringBuilder();
[5013]282            // all separator characters are transformed to the internally used separator character
283            yield return INTERNAL_SEPARATOR;
[2446]284          } else {
285            subStr.Append(c);
286          }
287        }
288        yield return subStr.ToString();
289      }
290
[2]291      private Token MakeToken(string strToken) {
[406]292        Token token = new Token(TokenTypeEnum.String, strToken);
[5013]293        if (strToken.Equals(INTERNAL_SEPARATOR)) {
[2446]294          return SeparatorToken;
[1221]295        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]296          token.type = TokenTypeEnum.Double;
297          return token;
[6760]298        } else if (DateTime.TryParse(strToken, out token.dateTimeValue)) {
299          token.type = TokenTypeEnum.DateTime;
300          return token;
[2]301        }
[2446]302
[6760]303        // couldn't parse the token as an int or float number  or datetime value so return a string token
[406]304        return token;
[2]305      }
306
307      public Token Peek() {
308        return tokens[0];
309      }
310
311      public Token Next() {
312        Token next = tokens[0];
313        tokens.RemoveAt(0);
[1221]314        if (tokens.Count == 0) {
[2]315          ReadNextTokens();
316        }
317        return next;
318      }
319
320      public bool HasNext() {
321        return tokens.Count > 0 || !reader.EndOfStream;
322      }
323    }
324    #endregion
325
326    #region parsing
[3264]327    private void Parse() {
328      ParseVariableNames();
[1221]329      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]330      ParseValues();
331      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]332    }
333
[3264]334    private void ParseValues() {
[1221]335      while (tokenizer.HasNext()) {
[6760]336        if (tokenizer.Peek() == tokenizer.NewlineToken) {
337          tokenizer.Next();
338        } else {
339          List<object> row = new List<object>();
340          object value = NextValue(tokenizer);
341          row.Add(value);
342          while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
343            Expect(tokenizer.SeparatorToken);
344            row.Add(NextValue(tokenizer));
345          }
346          Expect(tokenizer.NewlineToken);
347          // all rows have to have the same number of values           
348          // the first row defines how many samples are needed
349          if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
350            Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
351                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
352                  tokenizer.CurrentLineNumber);
353          }
354          rowValues.Add(row);
[2446]355        }
356      }
357    }
358
[6760]359    private object NextValue(Tokenizer tokenizer) {
360      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
[2446]361      Token current = tokenizer.Next();
[6760]362      if (current.type == TokenTypeEnum.Separator) {
[2446]363        return double.NaN;
[6760]364      } else if (current.type == TokenTypeEnum.String) {
365        return current.stringValue;
[2446]366      } else if (current.type == TokenTypeEnum.Double) {
367        return current.doubleValue;
[6760]368      } else if (current.type == TokenTypeEnum.DateTime) {
369        return current.dateTimeValue;
[2]370      }
[3264]371      // found an unexpected token => throw error
372      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
373      // this line is never executed because Error() throws an exception
374      throw new InvalidOperationException();
[2]375    }
376
[3264]377    private void ParseVariableNames() {
[6760]378      //if first token is double no variables names are given
379      if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
[2446]380
[6760]381      // the first line must contain variable names
382      List<Token> tokens = new List<Token>();
383      Token valueToken;
384      valueToken = tokenizer.Next();
385      tokens.Add(valueToken);
386      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
387        Expect(tokenizer.SeparatorToken);
[1221]388        valueToken = tokenizer.Next();
[3264]389        if (valueToken != tokenizer.NewlineToken) {
[6760]390          tokens.Add(valueToken);
[2446]391        }
[2]392      }
[6760]393      if (valueToken != tokenizer.NewlineToken) {
394        Expect(tokenizer.NewlineToken);
395      }
396      variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]397    }
398
399    private void Expect(Token expectedToken) {
400      Token actualToken = tokenizer.Next();
[1221]401      if (actualToken != expectedToken) {
[273]402        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]403      }
404    }
405
[273]406    private void Error(string message, string token, int lineNumber) {
407      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]408    }
409    #endregion
[5484]410
411    [Serializable]
412    private class DataFormatException : Exception {
413      private int line;
414      public int Line {
415        get { return line; }
416      }
417      private string token;
418      public string Token {
419        get { return token; }
420      }
421      public DataFormatException(string message, string token, int line)
422        : base(message + "\nToken: " + token + " (line: " + line + ")") {
423        this.token = token;
424        this.line = line;
425      }
426
427      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
428    }
[2]429  }
430}
Note: See TracBrowser for help on using the repository browser.