source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 4239

Last change on this file since 4239 was 4239, checked in by gkronber, 12 years ago

Merged improvements of symbolic simplifier (revisions: r4220, r4226, r4235:4238) back into trunk. #1026

File size: 10.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using System.Text;
28
29namespace HeuristicLab.Problems.DataAnalysis {
30  public class CsvFileParser {
// Marker string; no other code in this class refers to it.
private const string VARIABLENAMES = "VARIABLENAMES";

// Rows collected while parsing; each inner list holds the values of one line.
private List<List<double>> rowValues;
// Names taken from the header line; empty when the file has no header line.
private List<string> variableNames;
// Tokenizer for the file that is currently being parsed.
private Tokenizer tokenizer;
35
// Backing field for Rows; assigned by Parse(fileName).
private int rows;
// Number of parsed rows. The setter only overwrites the stored number,
// it does not resize Values.
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}

// Backing field for Columns; assigned by Parse(fileName).
private int columns;
// Number of columns (taken from the first parsed row). The setter only
// overwrites the stored number, it does not resize Values.
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}

// Backing field for Values; stays null until Parse(fileName) has filled it.
private double[,] values;
// The parsed data as a [row, column] matrix.
public double[,] Values {
  get { return this.values; }
}
54
// The variable names read from the header line. When no names were read,
// one generated name per column is returned instead: "X000", "X001", ...
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }

    string[] generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
67
// Creates a parser whose row and variable-name collections start out empty.
public CsvFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<double>>();
}
72
// Throws away all rows and variable names collected so far.
private void Reset() {
  rowValues.Clear();
  variableNames.Clear();
}
77
// Parses the given file and publishes the result through Rows, Columns and Values.
// Parse errors raised by TryParse propagate to the caller.
public void Parse(string fileName) {
  TryParse(fileName);

  // copy the list of parsed rows into the rectangular result matrix;
  // the first row determines the number of columns
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];

  for (int r = 0; r < rowValues.Count; r++) {
    List<double> parsedRow = rowValues[r];
    for (int c = 0; c < parsedRow.Count; c++) {
      values[r, c] = parsedRow[c];
    }
  }
}
95
// Tries to parse the file with each supported number format in turn and returns
// as soon as one attempt succeeds. When every attempt fails with a
// DataFormatException, the exception of the last attempt is thrown.
private void TryParse(string fileName) {
  NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
  Exception lastEx = null;
  foreach (NumberFormatInfo numberFormat in possibleFormats) {
    // Start every attempt from a clean slate. Without this, rows and names left
    // over from an earlier Parse(fileName) call or from a failed attempt would
    // be mixed into the result of this attempt.
    Reset();
    using (StreamReader reader = new StreamReader(fileName)) {
      tokenizer = new Tokenizer(reader, numberFormat);
      try {
        // parse the file
        Parse();
        return; // parsed without errors -> done
      }
      catch (DataFormatException ex) {
        lastEx = ex;
      }
    }
  }
  // no attempt was made at all (empty format list); throwing a null reference
  // would only produce a confusing NullReferenceException
  if (lastEx == null) {
    throw new InvalidOperationException("No number format available for parsing.");
  }
  // all number formats threw an exception -> throw the last exception
  throw lastEx;
}
115
#region tokenizer
// The kinds of tokens the tokenizer produces.
internal enum TokenTypeEnum {
  NewLine, Separator, String, Double
}

// A single token: its kind, its text and (for Double tokens) its numeric value.
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  // only meaningful when type is Double; 0.0 otherwise
  public double doubleValue;

  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    stringValue = value;
    doubleValue = 0.0;
  }

  public override string ToString() {
    return stringValue;
  }
}


// Splits a text stream line by line into value, separator and newline tokens.
// Separators and line ends are always reported as the shared SeparatorToken and
// NewlineToken instances, so callers can compare tokens by reference.
internal class Tokenizer {
  private readonly StreamReader reader;
  private readonly NumberFormatInfo numberFormatInfo;
  private readonly char separator;
  // tokens of the buffered line that have not been consumed yet
  private readonly Queue<Token> tokens;

  // Number of lines read from the stream so far. Because Next() reads ahead as
  // soon as the buffer runs empty, this is the number of the line whose tokens
  // are currently buffered.
  public int CurrentLineNumber { get; private set; }
  // Text of the most recently read line.
  public string CurrentLine { get; private set; }

  public Token NewlineToken { get; private set; }
  public Token SeparatorToken { get; private set; }

  // Creates a tokenizer that splits the lines of reader at the given separator
  // and parses numbers with numberFormatInfo. The first line is read immediately.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    SeparatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
    NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }
  // Creates a tokenizer that uses ';' as separator.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
    : this(reader, numberFormatInfo, ';') {
  }

  // Reads the next line (if there is one) and appends its tokens followed by a
  // newline token to the buffer. Fields are trimmed; empty fields produce no token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    string separatorString = SeparatorToken.stringValue;
    foreach (string part in Split(CurrentLine)) {
      // recognize separators before trimming, otherwise a whitespace
      // separator (e.g. a tab) would be trimmed away and silently dropped
      if (part == separatorString) {
        tokens.Enqueue(SeparatorToken);
        continue;
      }
      string trimmed = part.Trim();
      if (trimmed.Length > 0) {
        tokens.Enqueue(MakeToken(trimmed));
      }
    }
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Cuts a line at every occurrence of the configured separator and yields the
  // fields and the separators alternately (field, separator, field, ...).
  private IEnumerable<string> Split(string line) {
    string separatorString = SeparatorToken.stringValue;
    StringBuilder field = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        yield return field.ToString();
        field.Length = 0;
        yield return separatorString;
      } else {
        field.Append(c);
      }
    }
    yield return field.ToString();
  }

  // Creates the token for a (trimmed, non-empty) field: a Double token when the
  // text is a number in the configured format, a String token otherwise.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    return token;
  }

  // Returns the next token without consuming it. Must only be called while
  // tokens are available (see HasNext).
  public Token Peek() {
    return tokens.Peek();
  }

  // Consumes and returns the next token. When this empties the buffer the next
  // line is read right away, which also advances CurrentLine and CurrentLineNumber.
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  // True while there are buffered tokens or unread lines.
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
#endregion
236
237    #region parsing
// Parses the file behind the current tokenizer: an optional line of variable
// names followed by the data rows. Raises a parse error when no data rows follow.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
244
// Reads all remaining lines as data rows and appends them to rowValues.
// Raises a parse error when a row has a different number of values than the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // Remember the line number before reading the row. Consuming the row's
    // newline token makes the tokenizer read the following line, after which
    // CurrentLineNumber no longer refers to this row.
    int lineNumber = tokenizer.CurrentLineNumber;

    List<double> row = new List<double>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);
    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
        "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
    }
    // add the current row to the collection of rows
    rowValues.Add(row);
  }
}
265
// Returns the value of the next field. A missing field (the next token is a
// separator or a newline, which is left unconsumed) and a non-numeric field
// both yield NaN.
private double NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) {
    return double.NaN;
  }

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Double:
      // just take the value
      return current.doubleValue;
    case TokenTypeEnum.Separator:
    case TokenTypeEnum.String:
      return double.NaN;
  }

  // found an unexpected token => throw error
  Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
  // this line is never executed because Error() throws an exception
  throw new InvalidOperationException();
}
280
// Reads the variable names from the first line. The first line is taken to be a
// header when it does not start with a double value; otherwise nothing is
// consumed and variableNames is left untouched. A blank first line is consumed
// and produces no names.
private void ParseVariableNames() {
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) {
    return;
  }

  List<string> names = new List<string>();
  Token current = tokenizer.Next();
  if (current != tokenizer.NewlineToken) {
    names.Add(current.stringValue.Trim());
  }
  // Stop as soon as the newline token of the header has been consumed. The
  // tokenizer has then already buffered the next line, and a leading separator
  // on that line must not be mistaken for a continuation of the header.
  while (current != tokenizer.NewlineToken
         && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    current = tokenizer.Next();
    if (current != tokenizer.NewlineToken) {
      names.Add(current.stringValue.Trim());
    }
  }
  // the header line has to end here unless its newline was consumed above
  // (header with a trailing separator, or blank line)
  if (current != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = names;
}
303
// Consumes the next token and raises a parse error unless it is the expected
// token instance.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
310
// Always throws a DataFormatException carrying the message (with a fixed
// prefix), the offending token and the line number.
private void Error(string message, string token, int lineNumber) {
  string text = "Error while parsing.\n" + message;
  throw new DataFormatException(text, token, lineNumber);
}
314    #endregion
315  }
316}
Note: See TracBrowser for help on using the repository browser.