Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 3491

Last change on this file since 3491 was 3373, checked in by gkronber, 14 years ago

Refactored HeuristicLab.Problems.DataAnalysis namespace. #938 (Data types and operators for regression problems)

File size: 10.5 KB
Rev Line
[2]1#region License Information
2/* HeuristicLab
[3264]3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
[2446]26using System.Linq;
[2]27using HeuristicLab.Data;
[2446]28using System.Text;
[2]29
[3373]30namespace HeuristicLab.Problems.DataAnalysis {
[3264]31  public class CsvFileParser {
[273]32    private const string VARIABLENAMES = "VARIABLENAMES";
[2]33    private Tokenizer tokenizer;
[3264]34    private List<string> variableNames;
35    private List<List<double>> rowValues;
[2]36
37    private int rows;
38    public int Rows {
39      get { return rows; }
40      set { rows = value; }
41    }
42
43    private int columns;
44    public int Columns {
45      get { return columns; }
46      set { columns = value; }
47    }
48
[3264]49    private double[,] values;
50    public double[,] Values {
[2]51      get {
[3264]52        return values;
[2]53      }
54    }
55
[3264]56    public IEnumerable<string> VariableNames {
[2]57      get {
[3264]58        if (variableNames.Count > 0) return variableNames;
59        else {
[273]60          string[] names = new string[columns];
[1221]61          for (int i = 0; i < names.Length; i++) {
[273]62            names[i] = "X" + i.ToString("000");
63          }
64          return names;
[2]65        }
66      }
67    }
68
[3264]69    public CsvFileParser() {
70      rowValues = new List<List<double>>();
71      variableNames = new List<string>();
[2]72    }
73
[3264]74    private void Reset() {
75      variableNames.Clear();
76      rowValues.Clear();
[2]77    }
78
[3264]79    public void Parse(string fileName) {
80      TryParse(fileName);
[2]81      // translate the list of samples into a DoubleMatrixData item
[3264]82      rows = rowValues.Count;
83      columns = rowValues[0].Count;
84      values = new double[rows, columns];
[2]85
[3264]86      int rowIndex = 0;
87      int columnIndex = 0;
88      foreach (List<double> row in rowValues) {
89        columnIndex = 0;
[1221]90        foreach (double element in row) {
[3264]91          values[rowIndex, columnIndex++] = element;
[2]92        }
[3264]93        rowIndex++;
[2]94      }
95    }
96
[3264]97    private void TryParse(string fileName) {
[405]98      Exception lastEx = null;
[3264]99      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo };
[1221]100      foreach (NumberFormatInfo numberFormat in possibleFormats) {
[3264]101        using (StreamReader reader = new StreamReader(fileName)) {
[405]102          tokenizer = new Tokenizer(reader, numberFormat);
103          try {
104            // parse the file
[3264]105            Parse();
[405]106            return; // parsed without errors -> return;
[1221]107          }
108          catch (DataFormatException ex) {
[405]109            lastEx = ex;
110          }
111        }
112      }
113      // all number formats threw an exception -> rethrow the last exception
114      throw lastEx;
115    }
116
[2]117    #region tokenizer
118    internal enum TokenTypeEnum {
[3264]119      NewLine, Separator, String, Double
[2]120    }
121
122    internal class Token {
123      public TokenTypeEnum type;
124      public string stringValue;
125      public double doubleValue;
126
127      public Token(TokenTypeEnum type, string value) {
128        this.type = type;
129        stringValue = value;
130        doubleValue = 0.0;
131      }
132
133      public override string ToString() {
134        return stringValue;
135      }
136    }
137
138
[3264]139    internal class Tokenizer {
[2]140      private StreamReader reader;
141      private List<Token> tokens;
[405]142      private NumberFormatInfo numberFormatInfo;
[2]143
[3264]144      private int currentLineNumber = 0;
145      public int CurrentLineNumber {
146        get { return currentLineNumber; }
147        private set { currentLineNumber = value; }
148      }
149      private string currentLine;
150      public string CurrentLine {
151        get { return currentLine; }
152        private set { currentLine = value; }
153      }
[2]154
[3264]155      private Token newlineToken;
156      public Token NewlineToken {
157        get { return newlineToken; }
158        private set { newlineToken = value; }
159      }
160      private Token separatorToken;
161      public Token SeparatorToken {
162        get { return separatorToken; }
163        private set { separatorToken = value; }
164      }
[2]165
[3264]166      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
[2]167        this.reader = reader;
[405]168        this.numberFormatInfo = numberFormatInfo;
[3264]169        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
170        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]171        tokens = new List<Token>();
172        ReadNextTokens();
173      }
[3264]174      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
175        : this(reader, numberFormatInfo, ';') {
176      }
[2]177
178      private void ReadNextTokens() {
[1221]179        if (!reader.EndOfStream) {
[2]180          CurrentLine = reader.ReadLine();
[2446]181          var newTokens = from str in Split(CurrentLine)
182                          let trimmedStr = str.Trim()
183                          where !string.IsNullOrEmpty(trimmedStr)
184                          select MakeToken(trimmedStr.Trim());
[2]185
[2446]186          tokens.AddRange(newTokens);
[2]187          tokens.Add(NewlineToken);
188          CurrentLineNumber++;
189        }
190      }
191
[2446]192      private IEnumerable<string> Split(string line) {
193        StringBuilder subStr = new StringBuilder();
194        foreach (char c in line) {
[3264]195          if (c == ';') {
[2446]196            yield return subStr.ToString();
197            subStr = new StringBuilder();
198            yield return c.ToString();
199          } else {
200            subStr.Append(c);
201          }
202        }
203        yield return subStr.ToString();
204      }
205
[2]206      private Token MakeToken(string strToken) {
[406]207        Token token = new Token(TokenTypeEnum.String, strToken);
[3264]208        if (strToken.Equals(SeparatorToken.stringValue)) {
[2446]209          return SeparatorToken;
[1221]210        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]211          token.type = TokenTypeEnum.Double;
212          return token;
[2]213        }
[2446]214
[406]215        // couldn't parse the token as an int or float number so return a string token
216        return token;
[2]217      }
218
219      public Token Peek() {
220        return tokens[0];
221      }
222
223      public Token Next() {
224        Token next = tokens[0];
225        tokens.RemoveAt(0);
[1221]226        if (tokens.Count == 0) {
[2]227          ReadNextTokens();
228        }
229        return next;
230      }
231
232      public bool HasNext() {
233        return tokens.Count > 0 || !reader.EndOfStream;
234      }
235    }
236    #endregion
237
238    #region parsing
[3264]239    private void Parse() {
240      ParseVariableNames();
[1221]241      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]242      ParseValues();
243      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]244    }
245
[3264]246    private void ParseValues() {
[1221]247      while (tokenizer.HasNext()) {
[2446]248        List<double> row = new List<double>();
[3264]249        row.Add(NextValue(tokenizer));
250        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
251          Expect(tokenizer.SeparatorToken);
252          row.Add(NextValue(tokenizer));
[2446]253        }
[3264]254        Expect(tokenizer.NewlineToken);
255        // all rows have to have the same number of values           
256        // the first row defines how many samples are needed
257        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
258          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
259            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
[2446]260        }
261        // add the current row to the collection of rows and start a new row
[3264]262        rowValues.Add(row);
[2446]263        row = new List<double>();
264      }
265    }
266
[3264]267    private double NextValue(Tokenizer tokenizer) {
268      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
[2446]269      Token current = tokenizer.Next();
[3264]270      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
[2446]271        return double.NaN;
272      } else if (current.type == TokenTypeEnum.Double) {
273        // just take the value
274        return current.doubleValue;
[2]275      }
[3264]276      // found an unexpected token => throw error
277      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
278      // this line is never executed because Error() throws an exception
279      throw new InvalidOperationException();
[2]280    }
281
[3264]282    private void ParseVariableNames() {
283      // if the first line doesn't start with a double value then we assume that the
284      // first line contains variable names
285      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
[2446]286
[2]287        List<Token> tokens = new List<Token>();
[1221]288        Token valueToken;
289        valueToken = tokenizer.Next();
[2446]290        tokens.Add(valueToken);
[3264]291        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
292          Expect(tokenizer.SeparatorToken);
[2]293          valueToken = tokenizer.Next();
[3264]294          if (valueToken != tokenizer.NewlineToken) {
[2446]295            tokens.Add(valueToken);
296          }
[2]297        }
[3264]298        if (valueToken != tokenizer.NewlineToken) {
299          Expect(tokenizer.NewlineToken);
[2446]300        }
[3264]301        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]302      }
303    }
304
305    private void Expect(Token expectedToken) {
306      Token actualToken = tokenizer.Next();
[1221]307      if (actualToken != expectedToken) {
[273]308        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]309      }
310    }
311
[273]312    private void Error(string message, string token, int lineNumber) {
313      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]314    }
315    #endregion
316  }
317}
Note: See TracBrowser for help on using the repository browser.