Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 4791

Last change on this file since 4791 was 4341, checked in by gkronber, 14 years ago

Merged changesets from revisions r4249, r4250, r4251, r4291, r4295 from trunk into data analysis exploration #1142.

File size: 10.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using System.Text;
28
29namespace HeuristicLab.Problems.DataAnalysis {
30  public class CsvFileParser {
31    private const string VARIABLENAMES = "VARIABLENAMES";
32    private Tokenizer tokenizer;
33    private List<string> variableNames;
34    private List<List<double>> rowValues;
35
36    private int rows;
37    public int Rows {
38      get { return rows; }
39      set { rows = value; }
40    }
41
42    private int columns;
43    public int Columns {
44      get { return columns; }
45      set { columns = value; }
46    }
47
48    private double[,] values;
49    public double[,] Values {
50      get {
51        return values;
52      }
53    }
54
55    public IEnumerable<string> VariableNames {
56      get {
57        if (variableNames.Count > 0) return variableNames;
58        else {
59          string[] names = new string[columns];
60          for (int i = 0; i < names.Length; i++) {
61            names[i] = "X" + i.ToString("000");
62          }
63          return names;
64        }
65      }
66    }
67
68    public CsvFileParser() {
69      rowValues = new List<List<double>>();
70      variableNames = new List<string>();
71    }
72
73    private void Reset() {
74      variableNames.Clear();
75      rowValues.Clear();
76    }
77
78    public void Parse(string fileName) {
79      TryParse(fileName);
80      // translate the list of samples into a DoubleMatrixData item
81      rows = rowValues.Count;
82      columns = rowValues[0].Count;
83      values = new double[rows, columns];
84
85      int rowIndex = 0;
86      int columnIndex = 0;
87      foreach (List<double> row in rowValues) {
88        columnIndex = 0;
89        foreach (double element in row) {
90          values[rowIndex, columnIndex++] = element;
91        }
92        rowIndex++;
93      }
94    }
95
96    private void TryParse(string fileName) {
97      Exception lastEx = null;
98      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
99      foreach (NumberFormatInfo numberFormat in possibleFormats) {
100        using (StreamReader reader = new StreamReader(fileName)) {
101          tokenizer = new Tokenizer(reader, numberFormat);
102          try {
103            // parse the file
104            Parse();
105            return; // parsed without errors -> return;
106          }
107          catch (DataFormatException ex) {
108            lastEx = ex;
109          }
110        }
111      }
112      // all number formats threw an exception -> rethrow the last exception
113      throw lastEx;
114    }
115
116    #region tokenizer
117    internal enum TokenTypeEnum {
118      NewLine, Separator, String, Double
119    }
120
121    internal class Token {
122      public TokenTypeEnum type;
123      public string stringValue;
124      public double doubleValue;
125
126      public Token(TokenTypeEnum type, string value) {
127        this.type = type;
128        stringValue = value;
129        doubleValue = 0.0;
130      }
131
132      public override string ToString() {
133        return stringValue;
134      }
135    }
136
137
138    internal class Tokenizer {
139      private StreamReader reader;
140      private List<Token> tokens;
141      private NumberFormatInfo numberFormatInfo;
142
143      private int currentLineNumber = 0;
144      public int CurrentLineNumber {
145        get { return currentLineNumber; }
146        private set { currentLineNumber = value; }
147      }
148      private string currentLine;
149      public string CurrentLine {
150        get { return currentLine; }
151        private set { currentLine = value; }
152      }
153
154      private Token newlineToken;
155      public Token NewlineToken {
156        get { return newlineToken; }
157        private set { newlineToken = value; }
158      }
159      private Token separatorToken;
160      public Token SeparatorToken {
161        get { return separatorToken; }
162        private set { separatorToken = value; }
163      }
164
165      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
166        this.reader = reader;
167        this.numberFormatInfo = numberFormatInfo;
168        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
169        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
170        tokens = new List<Token>();
171        ReadNextTokens();
172      }
173      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
174        : this(reader, numberFormatInfo, ';') {
175      }
176
177      private void ReadNextTokens() {
178        if (!reader.EndOfStream) {
179          CurrentLine = reader.ReadLine();
180          var newTokens = from str in Split(CurrentLine)
181                          let trimmedStr = str.Trim()
182                          where !string.IsNullOrEmpty(trimmedStr)
183                          select MakeToken(trimmedStr.Trim());
184
185          tokens.AddRange(newTokens);
186          tokens.Add(NewlineToken);
187          CurrentLineNumber++;
188        }
189      }
190
191      private IEnumerable<string> Split(string line) {
192        StringBuilder subStr = new StringBuilder();
193        foreach (char c in line) {
194          if (c == ';') {
195            yield return subStr.ToString();
196            subStr = new StringBuilder();
197            yield return c.ToString();
198          } else {
199            subStr.Append(c);
200          }
201        }
202        yield return subStr.ToString();
203      }
204
205      private Token MakeToken(string strToken) {
206        Token token = new Token(TokenTypeEnum.String, strToken);
207        if (strToken.Equals(SeparatorToken.stringValue)) {
208          return SeparatorToken;
209        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
210          token.type = TokenTypeEnum.Double;
211          return token;
212        }
213
214        // couldn't parse the token as an int or float number so return a string token
215        return token;
216      }
217
218      public Token Peek() {
219        return tokens[0];
220      }
221
222      public Token Next() {
223        Token next = tokens[0];
224        tokens.RemoveAt(0);
225        if (tokens.Count == 0) {
226          ReadNextTokens();
227        }
228        return next;
229      }
230
231      public bool HasNext() {
232        return tokens.Count > 0 || !reader.EndOfStream;
233      }
234    }
235    #endregion
236
237    #region parsing
238    private void Parse() {
239      ParseVariableNames();
240      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
241      ParseValues();
242      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
243    }
244
245    private void ParseValues() {
246      while (tokenizer.HasNext()) {
247        List<double> row = new List<double>();
248        row.Add(NextValue(tokenizer));
249        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
250          Expect(tokenizer.SeparatorToken);
251          row.Add(NextValue(tokenizer));
252        }
253        Expect(tokenizer.NewlineToken);
254        // all rows have to have the same number of values           
255        // the first row defines how many samples are needed
256        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
257          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
258            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
259        }
260        // add the current row to the collection of rows and start a new row
261        rowValues.Add(row);
262        row = new List<double>();
263      }
264    }
265
266    private double NextValue(Tokenizer tokenizer) {
267      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
268      Token current = tokenizer.Next();
269      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
270        return double.NaN;
271      } else if (current.type == TokenTypeEnum.Double) {
272        // just take the value
273        return current.doubleValue;
274      }
275      // found an unexpected token => throw error
276      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
277      // this line is never executed because Error() throws an exception
278      throw new InvalidOperationException();
279    }
280
281    private void ParseVariableNames() {
282      // if the first line doesn't start with a double value then we assume that the
283      // first line contains variable names
284      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
285
286        List<Token> tokens = new List<Token>();
287        Token valueToken;
288        valueToken = tokenizer.Next();
289        tokens.Add(valueToken);
290        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
291          Expect(tokenizer.SeparatorToken);
292          valueToken = tokenizer.Next();
293          if (valueToken != tokenizer.NewlineToken) {
294            tokens.Add(valueToken);
295          }
296        }
297        if (valueToken != tokenizer.NewlineToken) {
298          Expect(tokenizer.NewlineToken);
299        }
300        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
301      }
302    }
303
304    private void Expect(Token expectedToken) {
305      Token actualToken = tokenizer.Next();
306      if (actualToken != expectedToken) {
307        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
308      }
309    }
310
311    private void Error(string message, string token, int lineNumber) {
312      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
313    }
314    #endregion
315  }
316}
Note: See TracBrowser for help on using the repository browser.