Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 4239

Last change on this file since 4239 was 4239, checked in by gkronber, 14 years ago

Merged improvements of symbolic simplifier (revisions: r4220, r4226, r4235:4238) back into trunk. #1026

File size: 10.5 KB
RevLine 
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
27using System.Text;
[2]28
[3373]29namespace HeuristicLab.Problems.DataAnalysis {
[3264]30  public class CsvFileParser {
[273]31    private const string VARIABLENAMES = "VARIABLENAMES";
[2]32    private Tokenizer tokenizer;
[3264]33    private List<string> variableNames;
34    private List<List<double>> rowValues;
[2]35
36    private int rows;
37    public int Rows {
38      get { return rows; }
39      set { rows = value; }
40    }
41
42    private int columns;
43    public int Columns {
44      get { return columns; }
45      set { columns = value; }
46    }
47
[3264]48    private double[,] values;
49    public double[,] Values {
[2]50      get {
[3264]51        return values;
[2]52      }
53    }
54
[3264]55    public IEnumerable<string> VariableNames {
[2]56      get {
[3264]57        if (variableNames.Count > 0) return variableNames;
58        else {
[273]59          string[] names = new string[columns];
[1221]60          for (int i = 0; i < names.Length; i++) {
[273]61            names[i] = "X" + i.ToString("000");
62          }
63          return names;
[2]64        }
65      }
66    }
67
[3264]68    public CsvFileParser() {
69      rowValues = new List<List<double>>();
70      variableNames = new List<string>();
[2]71    }
72
[3264]73    private void Reset() {
74      variableNames.Clear();
75      rowValues.Clear();
[2]76    }
77
[3264]78    public void Parse(string fileName) {
79      TryParse(fileName);
[2]80      // translate the list of samples into a DoubleMatrixData item
[3264]81      rows = rowValues.Count;
82      columns = rowValues[0].Count;
83      values = new double[rows, columns];
[2]84
[3264]85      int rowIndex = 0;
86      int columnIndex = 0;
87      foreach (List<double> row in rowValues) {
88        columnIndex = 0;
[1221]89        foreach (double element in row) {
[3264]90          values[rowIndex, columnIndex++] = element;
[2]91        }
[3264]92        rowIndex++;
[2]93      }
94    }
95
[3264]96    private void TryParse(string fileName) {
[405]97      Exception lastEx = null;
[3889]98      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
[1221]99      foreach (NumberFormatInfo numberFormat in possibleFormats) {
[3264]100        using (StreamReader reader = new StreamReader(fileName)) {
[405]101          tokenizer = new Tokenizer(reader, numberFormat);
102          try {
103            // parse the file
[3264]104            Parse();
[405]105            return; // parsed without errors -> return;
[1221]106          }
107          catch (DataFormatException ex) {
[405]108            lastEx = ex;
109          }
110        }
111      }
112      // all number formats threw an exception -> rethrow the last exception
113      throw lastEx;
114    }
115
[2]116    #region tokenizer
117    internal enum TokenTypeEnum {
[3264]118      NewLine, Separator, String, Double
[2]119    }
120
121    internal class Token {
122      public TokenTypeEnum type;
123      public string stringValue;
124      public double doubleValue;
125
126      public Token(TokenTypeEnum type, string value) {
127        this.type = type;
128        stringValue = value;
129        doubleValue = 0.0;
130      }
131
132      public override string ToString() {
133        return stringValue;
134      }
135    }
136
137
[3264]138    internal class Tokenizer {
[2]139      private StreamReader reader;
140      private List<Token> tokens;
[405]141      private NumberFormatInfo numberFormatInfo;
[2]142
[3264]143      private int currentLineNumber = 0;
144      public int CurrentLineNumber {
145        get { return currentLineNumber; }
146        private set { currentLineNumber = value; }
147      }
148      private string currentLine;
149      public string CurrentLine {
150        get { return currentLine; }
151        private set { currentLine = value; }
152      }
[2]153
[3264]154      private Token newlineToken;
155      public Token NewlineToken {
156        get { return newlineToken; }
157        private set { newlineToken = value; }
158      }
159      private Token separatorToken;
160      public Token SeparatorToken {
161        get { return separatorToken; }
162        private set { separatorToken = value; }
163      }
[2]164
[3264]165      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
[2]166        this.reader = reader;
[405]167        this.numberFormatInfo = numberFormatInfo;
[3264]168        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
169        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]170        tokens = new List<Token>();
171        ReadNextTokens();
172      }
[3264]173      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
174        : this(reader, numberFormatInfo, ';') {
175      }
[2]176
177      private void ReadNextTokens() {
[1221]178        if (!reader.EndOfStream) {
[2]179          CurrentLine = reader.ReadLine();
[2446]180          var newTokens = from str in Split(CurrentLine)
181                          let trimmedStr = str.Trim()
182                          where !string.IsNullOrEmpty(trimmedStr)
183                          select MakeToken(trimmedStr.Trim());
[2]184
[2446]185          tokens.AddRange(newTokens);
[2]186          tokens.Add(NewlineToken);
187          CurrentLineNumber++;
188        }
189      }
190
[2446]191      private IEnumerable<string> Split(string line) {
192        StringBuilder subStr = new StringBuilder();
193        foreach (char c in line) {
[3264]194          if (c == ';') {
[2446]195            yield return subStr.ToString();
196            subStr = new StringBuilder();
197            yield return c.ToString();
198          } else {
199            subStr.Append(c);
200          }
201        }
202        yield return subStr.ToString();
203      }
204
[2]205      private Token MakeToken(string strToken) {
[406]206        Token token = new Token(TokenTypeEnum.String, strToken);
[3264]207        if (strToken.Equals(SeparatorToken.stringValue)) {
[2446]208          return SeparatorToken;
[1221]209        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]210          token.type = TokenTypeEnum.Double;
211          return token;
[2]212        }
[2446]213
[406]214        // couldn't parse the token as an int or float number so return a string token
215        return token;
[2]216      }
217
218      public Token Peek() {
219        return tokens[0];
220      }
221
222      public Token Next() {
223        Token next = tokens[0];
224        tokens.RemoveAt(0);
[1221]225        if (tokens.Count == 0) {
[2]226          ReadNextTokens();
227        }
228        return next;
229      }
230
231      public bool HasNext() {
232        return tokens.Count > 0 || !reader.EndOfStream;
233      }
234    }
235    #endregion
236
237    #region parsing
[3264]238    private void Parse() {
239      ParseVariableNames();
[1221]240      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]241      ParseValues();
242      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]243    }
244
[3264]245    private void ParseValues() {
[1221]246      while (tokenizer.HasNext()) {
[2446]247        List<double> row = new List<double>();
[3264]248        row.Add(NextValue(tokenizer));
249        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
250          Expect(tokenizer.SeparatorToken);
251          row.Add(NextValue(tokenizer));
[2446]252        }
[3264]253        Expect(tokenizer.NewlineToken);
254        // all rows have to have the same number of values           
255        // the first row defines how many samples are needed
256        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
257          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
258            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
[2446]259        }
260        // add the current row to the collection of rows and start a new row
[3264]261        rowValues.Add(row);
[2446]262        row = new List<double>();
263      }
264    }
265
[3264]266    private double NextValue(Tokenizer tokenizer) {
267      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
[2446]268      Token current = tokenizer.Next();
[3264]269      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
[2446]270        return double.NaN;
271      } else if (current.type == TokenTypeEnum.Double) {
272        // just take the value
273        return current.doubleValue;
[2]274      }
[3264]275      // found an unexpected token => throw error
276      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
277      // this line is never executed because Error() throws an exception
278      throw new InvalidOperationException();
[2]279    }
280
[3264]281    private void ParseVariableNames() {
282      // if the first line doesn't start with a double value then we assume that the
283      // first line contains variable names
284      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
[2446]285
[2]286        List<Token> tokens = new List<Token>();
[1221]287        Token valueToken;
288        valueToken = tokenizer.Next();
[2446]289        tokens.Add(valueToken);
[3264]290        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
291          Expect(tokenizer.SeparatorToken);
[2]292          valueToken = tokenizer.Next();
[3264]293          if (valueToken != tokenizer.NewlineToken) {
[2446]294            tokens.Add(valueToken);
295          }
[2]296        }
[3264]297        if (valueToken != tokenizer.NewlineToken) {
298          Expect(tokenizer.NewlineToken);
[2446]299        }
[3264]300        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]301      }
302    }
303
304    private void Expect(Token expectedToken) {
305      Token actualToken = tokenizer.Next();
[1221]306      if (actualToken != expectedToken) {
[273]307        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]308      }
309    }
310
[273]311    private void Error(string message, string token, int lineNumber) {
312      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]313    }
314    #endregion
315  }
316}
Note: See TracBrowser for help on using the repository browser.