Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 3373

Last change on this file since 3373 was 3373, checked in by gkronber, 14 years ago

Refactored HeuristicLab.Problems.DataAnalysis namespace. #938 (Data types and operators for regression problems)

File size: 10.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using System.Linq;
27using HeuristicLab.Data;
28using System.Text;
29
30namespace HeuristicLab.Problems.DataAnalysis {
31  public class CsvFileParser {
32    private const string VARIABLENAMES = "VARIABLENAMES";
33    private Tokenizer tokenizer;
34    private List<string> variableNames;
35    private List<List<double>> rowValues;
36
37    private int rows;
38    public int Rows {
39      get { return rows; }
40      set { rows = value; }
41    }
42
43    private int columns;
44    public int Columns {
45      get { return columns; }
46      set { columns = value; }
47    }
48
49    private double[,] values;
50    public double[,] Values {
51      get {
52        return values;
53      }
54    }
55
56    public IEnumerable<string> VariableNames {
57      get {
58        if (variableNames.Count > 0) return variableNames;
59        else {
60          string[] names = new string[columns];
61          for (int i = 0; i < names.Length; i++) {
62            names[i] = "X" + i.ToString("000");
63          }
64          return names;
65        }
66      }
67    }
68
69    public CsvFileParser() {
70      rowValues = new List<List<double>>();
71      variableNames = new List<string>();
72    }
73
74    private void Reset() {
75      variableNames.Clear();
76      rowValues.Clear();
77    }
78
79    public void Parse(string fileName) {
80      TryParse(fileName);
81      // translate the list of samples into a DoubleMatrixData item
82      rows = rowValues.Count;
83      columns = rowValues[0].Count;
84      values = new double[rows, columns];
85
86      int rowIndex = 0;
87      int columnIndex = 0;
88      foreach (List<double> row in rowValues) {
89        columnIndex = 0;
90        foreach (double element in row) {
91          values[rowIndex, columnIndex++] = element;
92        }
93        rowIndex++;
94      }
95    }
96
97    private void TryParse(string fileName) {
98      Exception lastEx = null;
99      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo };
100      foreach (NumberFormatInfo numberFormat in possibleFormats) {
101        using (StreamReader reader = new StreamReader(fileName)) {
102          tokenizer = new Tokenizer(reader, numberFormat);
103          try {
104            // parse the file
105            Parse();
106            return; // parsed without errors -> return;
107          }
108          catch (DataFormatException ex) {
109            lastEx = ex;
110          }
111        }
112      }
113      // all number formats threw an exception -> rethrow the last exception
114      throw lastEx;
115    }
116
117    #region tokenizer
118    internal enum TokenTypeEnum {
119      NewLine, Separator, String, Double
120    }
121
122    internal class Token {
123      public TokenTypeEnum type;
124      public string stringValue;
125      public double doubleValue;
126
127      public Token(TokenTypeEnum type, string value) {
128        this.type = type;
129        stringValue = value;
130        doubleValue = 0.0;
131      }
132
133      public override string ToString() {
134        return stringValue;
135      }
136    }
137
138
139    internal class Tokenizer {
140      private StreamReader reader;
141      private List<Token> tokens;
142      private NumberFormatInfo numberFormatInfo;
143
144      private int currentLineNumber = 0;
145      public int CurrentLineNumber {
146        get { return currentLineNumber; }
147        private set { currentLineNumber = value; }
148      }
149      private string currentLine;
150      public string CurrentLine {
151        get { return currentLine; }
152        private set { currentLine = value; }
153      }
154
155      private Token newlineToken;
156      public Token NewlineToken {
157        get { return newlineToken; }
158        private set { newlineToken = value; }
159      }
160      private Token separatorToken;
161      public Token SeparatorToken {
162        get { return separatorToken; }
163        private set { separatorToken = value; }
164      }
165
166      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
167        this.reader = reader;
168        this.numberFormatInfo = numberFormatInfo;
169        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
170        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
171        tokens = new List<Token>();
172        ReadNextTokens();
173      }
174      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
175        : this(reader, numberFormatInfo, ';') {
176      }
177
178      private void ReadNextTokens() {
179        if (!reader.EndOfStream) {
180          CurrentLine = reader.ReadLine();
181          var newTokens = from str in Split(CurrentLine)
182                          let trimmedStr = str.Trim()
183                          where !string.IsNullOrEmpty(trimmedStr)
184                          select MakeToken(trimmedStr.Trim());
185
186          tokens.AddRange(newTokens);
187          tokens.Add(NewlineToken);
188          CurrentLineNumber++;
189        }
190      }
191
192      private IEnumerable<string> Split(string line) {
193        StringBuilder subStr = new StringBuilder();
194        foreach (char c in line) {
195          if (c == ';') {
196            yield return subStr.ToString();
197            subStr = new StringBuilder();
198            yield return c.ToString();
199          } else {
200            subStr.Append(c);
201          }
202        }
203        yield return subStr.ToString();
204      }
205
206      private Token MakeToken(string strToken) {
207        Token token = new Token(TokenTypeEnum.String, strToken);
208        if (strToken.Equals(SeparatorToken.stringValue)) {
209          return SeparatorToken;
210        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
211          token.type = TokenTypeEnum.Double;
212          return token;
213        }
214
215        // couldn't parse the token as an int or float number so return a string token
216        return token;
217      }
218
219      public Token Peek() {
220        return tokens[0];
221      }
222
223      public Token Next() {
224        Token next = tokens[0];
225        tokens.RemoveAt(0);
226        if (tokens.Count == 0) {
227          ReadNextTokens();
228        }
229        return next;
230      }
231
232      public bool HasNext() {
233        return tokens.Count > 0 || !reader.EndOfStream;
234      }
235    }
236    #endregion
237
238    #region parsing
239    private void Parse() {
240      ParseVariableNames();
241      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
242      ParseValues();
243      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
244    }
245
246    private void ParseValues() {
247      while (tokenizer.HasNext()) {
248        List<double> row = new List<double>();
249        row.Add(NextValue(tokenizer));
250        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
251          Expect(tokenizer.SeparatorToken);
252          row.Add(NextValue(tokenizer));
253        }
254        Expect(tokenizer.NewlineToken);
255        // all rows have to have the same number of values           
256        // the first row defines how many samples are needed
257        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
258          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
259            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
260        }
261        // add the current row to the collection of rows and start a new row
262        rowValues.Add(row);
263        row = new List<double>();
264      }
265    }
266
267    private double NextValue(Tokenizer tokenizer) {
268      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
269      Token current = tokenizer.Next();
270      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
271        return double.NaN;
272      } else if (current.type == TokenTypeEnum.Double) {
273        // just take the value
274        return current.doubleValue;
275      }
276      // found an unexpected token => throw error
277      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
278      // this line is never executed because Error() throws an exception
279      throw new InvalidOperationException();
280    }
281
282    private void ParseVariableNames() {
283      // if the first line doesn't start with a double value then we assume that the
284      // first line contains variable names
285      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
286
287        List<Token> tokens = new List<Token>();
288        Token valueToken;
289        valueToken = tokenizer.Next();
290        tokens.Add(valueToken);
291        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
292          Expect(tokenizer.SeparatorToken);
293          valueToken = tokenizer.Next();
294          if (valueToken != tokenizer.NewlineToken) {
295            tokens.Add(valueToken);
296          }
297        }
298        if (valueToken != tokenizer.NewlineToken) {
299          Expect(tokenizer.NewlineToken);
300        }
301        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
302      }
303    }
304
305    private void Expect(Token expectedToken) {
306      Token actualToken = tokenizer.Next();
307      if (actualToken != expectedToken) {
308        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
309      }
310    }
311
312    private void Error(string message, string token, int lineNumber) {
313      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
314    }
315    #endregion
316  }
317}
Note: See TracBrowser for help on using the repository browser.