Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 11247

Last change on this file since 11247 was 5275, checked in by gkronber, 14 years ago

Merged changes from trunk to data analysis exploration branch and added fractional distance metric evaluator. #1142

File size: 13.6 KB
RevLine 
[5275]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using System.Text;
28
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Reads a text file containing a table of numeric values into a two-dimensional double array.
  /// The column separator and the number format are guessed from a sample of the file.
  /// If the first line does not start with a number it is interpreted as a list of variable names.
  /// Cells that are empty or cannot be parsed as a number are stored as <see cref="double.NaN"/>.
  /// </summary>
  public class TableFileParser {
    // number of characters that are sampled (after the first line) to guess the file format
    private const int BUFFER_SIZE = 1024;
    // a character has to occur more often than this in the sampled block to be taken into account
    private const int MIN_OCCURRENCES = 10;
    // separator candidates; ' ' is only used as a fallback when none of these is frequent enough
    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
    // not referenced anywhere in this class
    private const string VARIABLENAMES = "VARIABLENAMES";
    private Tokenizer tokenizer;
    private List<string> variableNames;
    private List<List<double>> rowValues;

    /// <summary>Number of data rows read by the last successful call of <see cref="Parse(string)"/>.</summary>
    public int Rows { get; set; }

    /// <summary>Number of columns read by the last successful call of <see cref="Parse(string)"/>.</summary>
    public int Columns { get; set; }

    /// <summary>Parsed values indexed as [row, column]; null until a file has been parsed successfully.</summary>
    public double[,] Values { get; private set; }

    /// <summary>
    /// The variable names found in the first line of the file. If the file contained no names,
    /// the names "X000", "X001", ... are generated, one for each column.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;
        return Enumerable.Range(0, Columns)
          .Select(i => "X" + i.ToString("000"))
          .ToArray();
      }
    }

    public TableFileParser() {
      rowValues = new List<List<double>>();
      variableNames = new List<string>();
    }

    // Discards everything that was read by a previous call of Parse(string).
    private void Reset() {
      variableNames.Clear();
      rowValues.Clear();
      Rows = 0;
      Columns = 0;
      Values = null;
    }

    /// <summary>
    /// Parses the given file and fills <see cref="Rows"/>, <see cref="Columns"/>,
    /// <see cref="Values"/> and <see cref="VariableNames"/>.
    /// </summary>
    /// <param name="fileName">Path of the file to read.</param>
    /// <exception cref="DataFormatException">
    /// Thrown when the file contains no data rows, when a row has a different number of columns
    /// than the first data row, or when an unexpected token is found.
    /// </exception>
    public void Parse(string fileName) {
      // start from a clean state, otherwise the rows and names of a previously parsed file
      // would be mixed with the contents of this file
      Reset();

      NumberFormatInfo numberFormat;
      char separator;
      DetermineFileFormat(fileName, out numberFormat, out separator);
      using (StreamReader reader = new StreamReader(fileName)) {
        tokenizer = new Tokenizer(reader, numberFormat, separator);
        // parse the file
        Parse();
      }

      // copy the list of rows into the matrix
      // Parse() throws if no row was read and ParseValues() guarantees that all rows have the same length
      Rows = rowValues.Count;
      Columns = rowValues[0].Count;
      double[,] matrix = new double[Rows, Columns];
      for (int rowIndex = 0; rowIndex < Rows; rowIndex++) {
        List<double> row = rowValues[rowIndex];
        for (int columnIndex = 0; columnIndex < Columns; columnIndex++) {
          matrix[rowIndex, columnIndex] = row[columnIndex];
        }
      }
      Values = matrix;
    }

    // Guesses the number format and the column separator from the first BUFFER_SIZE characters
    // following the first line of the file.
    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
      using (StreamReader reader = new StreamReader(fileName)) {
        // skip first line (presumably because it may contain variable names instead of values)
        reader.ReadLine();
        // read a block
        char[] buffer = new char[BUFFER_SIZE];
        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
        // count frequency of each character in the block
        Dictionary<char, int> charCounts = buffer.Take(charsRead)
          .GroupBy(c => c)
          .ToDictionary(g => g.Key, g => g.Count());

        // depending on the characters occurring in the block
        // we distinguish a number of different cases based on the following rules:
        // many points => it must be English number format, the other frequently occurring char is the separator
        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
        //   => check the line in more detail:
        //            English: 0, 0, 0, 0
        //            German:  0,0 0,0 0,0 ...
        //            => if commas are followed by space => English format
        // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
        if (OccurrencesOf(charCounts, '.') > MIN_OCCURRENCES) {
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
        } else if (OccurrencesOf(charCounts, ',') > MIN_OCCURRENCES) {
          // no points and many commas
          int countCommaNonDigitPairs = 0;
          for (int i = 0; i < charsRead - 1; i++) {
            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
              countCommaNonDigitPairs++;
            }
          }
          if (countCommaNonDigitPairs > MIN_OCCURRENCES) {
            // English format (only integer values) with ',' as separator
            numberFormat = NumberFormatInfo.InvariantInfo;
            separator = ',';
          } else {
            // German format (real values); ',' is the decimal mark and therefore cannot be the separator
            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
            separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Where(c => c != ','));
          }
        } else {
          // no points and no commas => English format
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
        }
      }
    }

    // Returns the candidate that occurs most often (and more than MIN_OCCURRENCES times).
    // Ties are resolved in favor of the candidate that comes first. Returns ' ' if no candidate qualifies.
    private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
      return candidates
        .Where(c => OccurrencesOf(charCounts, c) > MIN_OCCURRENCES)
        .OrderByDescending(c => OccurrencesOf(charCounts, c))
        .DefaultIfEmpty(' ')
        .First();
    }

    // Returns how often c was counted, or 0 if it was not counted at all.
    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
      int count;
      return charCounts.TryGetValue(c, out count) ? count : 0;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      NewLine, Separator, String, Double
    }

    // A single token of the input. doubleValue is only meaningful for tokens of type Double.
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    // Splits the input line by line into value, separator and newline tokens.
    // All separators are represented by the single SeparatorToken instance and all line ends by the
    // single NewlineToken instance, so both can be recognized by reference comparison.
    internal class Tokenizer {
      private StreamReader reader;
      // tokens of the current line that have not been consumed yet
      private Queue<Token> tokens;
      private NumberFormatInfo numberFormatInfo;
      private char separator;
      private const string INTERNAL_SEPARATOR = "#";

      // Number of lines read so far. Because the next line is read as soon as the last token of a
      // line is consumed, this is the number of the line the next token belongs to.
      public int CurrentLineNumber { get; private set; }
      // Text of the line that was read last.
      public string CurrentLine { get; private set; }
      public Token NewlineToken { get; private set; }
      public Token SeparatorToken { get; private set; }

      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        this.separator = separator;
        SeparatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
        NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      // Reads the next line (if there is one) and appends its tokens followed by a newline token.
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        foreach (Token token in Tokenize(CurrentLine)) {
          tokens.Enqueue(token);
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      // Produces the tokens of one line. Separators are emitted as SeparatorToken directly instead of
      // being encoded as text, so a cell that happens to contain "#" is not mistaken for a separator.
      private IEnumerable<Token> Tokenize(string line) {
        StringBuilder cell = new StringBuilder();
        foreach (char c in line) {
          if (c == separator) {
            Token value = MakeValueToken(cell.ToString());
            if (value != null) yield return value;
            cell = new StringBuilder();
            yield return SeparatorToken;
          } else {
            cell.Append(c);
          }
        }
        Token lastValue = MakeValueToken(cell.ToString());
        if (lastValue != null) yield return lastValue;
      }

      // Creates a Double token if the trimmed text is a number in the configured number format,
      // a String token otherwise. Returns null for empty (or whitespace only) text.
      private Token MakeValueToken(string text) {
        string trimmed = text.Trim();
        if (trimmed.Length == 0) return null;

        Token token = new Token(TokenTypeEnum.String, trimmed);
        if (double.TryParse(trimmed, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        return token;
      }

      // Returns the next token without consuming it. Must only be called if HasNext() is true.
      public Token Peek() {
        return tokens.Peek();
      }

      // Consumes and returns the next token. Must only be called if HasNext() is true.
      public Token Next() {
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    // Parses the optional line of variable names followed by all data rows.
    private void Parse() {
      const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

      ParseVariableNames();
      if (!tokenizer.HasNext()) Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
      ParseValues();
      if (rowValues.Count == 0) Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    }

    // Reads all remaining lines as data rows and appends them to rowValues.
    private void ParseValues() {
      while (tokenizer.HasNext()) {
        // remember the line number now: the tokenizer already reads the following line
        // when the newline token of this row is consumed
        int lineNumber = tokenizer.CurrentLineNumber;

        List<double> row = new List<double>();
        row.Add(NextValue(tokenizer));
        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
          Expect(tokenizer.SeparatorToken);
          row.Add(NextValue(tokenizer));
        }
        Expect(tokenizer.NewlineToken);
        // all rows have to have the same number of values
        // the first row defines how many values are needed
        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
        }
        rowValues.Add(row);
      }
    }

    // Returns the value of the next cell. An empty cell (the next token is a separator or a newline,
    // which is not consumed) and a cell that is not a number both yield NaN.
    private double NextValue(Tokenizer tokenizer) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

      Token current = tokenizer.Next();
      switch (current.type) {
        case TokenTypeEnum.Double:
          // just take the value
          return current.doubleValue;
        case TokenTypeEnum.String:
        case TokenTypeEnum.Separator:
          return double.NaN;
        default:
          // found an unexpected token => throw error
          Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
          // this line is never executed because Error() throws an exception
          throw new InvalidOperationException();
      }
    }

    // If the first line doesn't start with a double value then we assume that the
    // first line contains variable names and consume it.
    private void ParseVariableNames() {
      if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

      Token valueToken = tokenizer.Next();
      // an empty first line contains no names (and must not be continued on the second line)
      if (valueToken == tokenizer.NewlineToken) return;

      List<Token> nameTokens = new List<Token>();
      nameTokens.Add(valueToken);
      bool lineConsumed = false;
      while (!lineConsumed && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        valueToken = tokenizer.Next();
        if (valueToken == tokenizer.NewlineToken) {
          // trailing separator: the line ends here, the following tokens belong to the first data row
          lineConsumed = true;
        } else {
          nameTokens.Add(valueToken);
        }
      }
      if (!lineConsumed) {
        Expect(tokenizer.NewlineToken);
      }
      variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
    }

    // Consumes the next token and throws if it is not the expected token instance.
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
      }
    }

    // Always throws a DataFormatException describing the problem.
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.