Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 14296

Last change on this file since 14296 was 14296, checked in by gkronber, 8 years ago

#2661: implemented fixes for several problems in the TableFileParser. We now also store the original string representation of all tokens and use those when we detect that a column cannot be read as DateTime / double.

File size: 26.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Diagnostics.Contracts;
27using System.Globalization;
28using System.IO;
29using System.Linq;
30using System.Runtime.Serialization;
31using System.Text;
32
33namespace HeuristicLab.Problems.Instances.DataAnalysis {
34  public class TableFileParser : Progress<long> { // reports the number of bytes read
35    private const int BUFFER_SIZE = 65536;
36    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
37    private const char WHITESPACECHAR = (char)0;
38    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
39    private Tokenizer tokenizer;
40    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
41
42
43    private Encoding encoding = Encoding.Default;
44
45    public Encoding Encoding {
46      get { return encoding; }
47      set {
48        if (value == null) throw new ArgumentNullException("Encoding");
49        encoding = value;
50      }
51    }
52
53
54    private int rows;
55    public int Rows {
56      get { return rows; }
57      set { rows = value; }
58    }
59
60    private int columns;
61    public int Columns {
62      get { return columns; }
63      set { columns = value; }
64    }
65
66    private List<IList> values;
67    public List<IList> Values {
68      get {
69        return values;
70      }
71    }
72
73    private List<string> variableNames;
74    public IEnumerable<string> VariableNames {
75      get {
76        if (variableNames.Count > 0) return variableNames;
77        else {
78          string[] names = new string[columns];
79          for (int i = 0; i < names.Length; i++) {
80            names[i] = "X" + i.ToString("000");
81          }
82          return names;
83        }
84      }
85    }
86
87    public TableFileParser() {
88      variableNames = new List<string>();
89    }
90
91    public bool AreColumnNamesInFirstLine(string fileName) {
92      NumberFormatInfo numberFormat;
93      DateTimeFormatInfo dateTimeFormatInfo;
94      char separator;
95      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
96      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
97        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
98      }
99    }
100
101    public bool AreColumnNamesInFirstLine(Stream stream) {
102      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
103      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
104      char separator = ',';
105      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
106    }
107
108    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
109                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
110      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
111        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
112      }
113    }
114
115    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
116                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
117      using (StreamReader reader = new StreamReader(stream, Encoding)) {
118        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
119        return (tokenizer.PeekType() != TokenTypeEnum.Double);
120      }
121    }
122
123    /// <summary>
124    /// Parses a file and determines the format first
125    /// </summary>
126    /// <param name="fileName">file which is parsed</param>
127    /// <param name="columnNamesInFirstLine"></param>
128    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
129      NumberFormatInfo numberFormat;
130      DateTimeFormatInfo dateTimeFormatInfo;
131      char separator;
132      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
133      EstimateNumberOfLines(fileName);
134      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
135    }
136
137    /// <summary>
138    /// Parses a file with the given formats
139    /// </summary>
140    /// <param name="fileName">file which is parsed</param>
141    /// <param name="numberFormat">Format of numbers</param>
142    /// <param name="dateTimeFormatInfo">Format of datetime</param>
143    /// <param name="separator">defines the separator</param>
144    /// <param name="columnNamesInFirstLine"></param>
145    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
146      EstimateNumberOfLines(fileName);
147      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
148        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
149      }
150    }
151
152    // determines the number of newline characters in the first 64KB to guess the number of rows for a file
153    private void EstimateNumberOfLines(string fileName) {
154      var len = new System.IO.FileInfo(fileName).Length;
155      var buf = new char[1024 * 1024];
156      using (var reader = new StreamReader(fileName, Encoding)) {
157        reader.ReadBlock(buf, 0, buf.Length);
158      }
159      int numNewLine = 0;
160      int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
161      foreach (var ch in buf) {
162        charsInCurrentLine++;
163        if (ch == '\n') {
164          if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
165          charsInCurrentLine = 0;
166          numNewLine++;
167        }
168      }
169      if (numNewLine <= 1) {
170        // fail -> keep the default setting
171        return;
172      } else {
173        double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
174        double estimatedLines = len / charsPerLineFactor;
175        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
176      }
177    }
178
179    /// <summary>
180    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
181    /// </summary>
182    /// <param name="stream">stream which is parsed</param>
183    /// <param name="columnNamesInFirstLine"></param>
184    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
185      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
186      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
187      char separator = ',';
188      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
189    }
190
191    /// <summary>
192    /// Parses a stream with the given formats.
193    /// </summary>
194    /// <param name="stream">Stream which is parsed</param>   
195    /// <param name="numberFormat">Format of numbers</param>
196    /// <param name="dateTimeFormatInfo">Format of datetime</param>
197    /// <param name="separator">defines the separator</param>
198    /// <param name="columnNamesInFirstLine"></param>
199    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
200      if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
201
202      using (var reader = new StreamReader(stream)) {
203        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
204        var strValues = new List<List<string>>();
205        values = new List<IList>();
206        Prepare(columnNamesInFirstLine, strValues);
207
208        int nLinesParsed = 0;
209        int colIdx = 0;
210        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
211          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
212            tokenizer.Skip();
213
214            // all rows have to have the same number of values
215            // the first row defines how many elements are needed
216            if (colIdx > 0 && values.Count != colIdx) {
217              // read at least one value in the row (support for skipping empty lines)
218              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
219                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
220                tokenizer.CurrentLineNumber);
221            }
222            OnReport(tokenizer.BytesRead);
223
224            nLinesParsed++;
225            colIdx = 0;
226          } else {
227            // read one value
228            TokenTypeEnum type;
229            string strVal;
230            double dblVal;
231            DateTime dateTimeVal;
232            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
233
234            if (colIdx == values.Count) {
235              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
236                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
237                tokenizer.CurrentLineNumber);
238            }
239            if (!IsColumnTypeCompatible(values[colIdx], type)) {
240              values[colIdx] = strValues[colIdx];
241            }
242
243            // add the value to the column
244            AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
245            if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
246              strValues[colIdx].Add(strVal);
247            }
248            colIdx++;
249          }
250        }
251      }
252
253      if (!values.Any() || values.First().Count == 0)
254        Error("Couldn't parse data values. Probably because of incorrect number format " +
255              "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
256
257      this.rows = values.First().Count;
258      this.columns = values.Count;
259
260      // after everything has been parsed make sure the lists are as compact as possible
261      foreach (var l in values) {
262        var dblList = l as List<double>;
263        var byteList = l as List<byte>;
264        var dateList = l as List<DateTime>;
265        var stringList = l as List<string>;
266        var objList = l as List<object>;
267        if (dblList != null) dblList.TrimExcess();
268        if (byteList != null) byteList.TrimExcess();
269        if (dateList != null) dateList.TrimExcess();
270        if (stringList != null) stringList.TrimExcess();
271        if (objList != null) objList.TrimExcess();
272      }
273
274      // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
275      GC.Collect(2, GCCollectionMode.Forced);
276    }
277
278    private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
279      if (columnNamesInFirstLine) {
280        ParseVariableNames();
281        if (!tokenizer.HasNext())
282          Error(
283            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
284            "", tokenizer.CurrentLineNumber);
285      }
286      // read first line to determine types and allocate specific lists
287      // read values... start in first row
288      int colIdx = 0;
289      while (tokenizer.PeekType() != TokenTypeEnum.NewLine) {
290        // read one value
291        TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
292        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
293
294        // initialize column
295        values.Add(CreateList(type, estimatedNumberOfLines));
296        if (type == TokenTypeEnum.String)
297          strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string>
298        else
299          strValues.Add(new List<string>(estimatedNumberOfLines));
300
301        AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
302        if (type != TokenTypeEnum.String)
303          strValues[colIdx].Add(strVal);
304        colIdx++;
305      }
306      tokenizer.Skip(); // skip newline
307    }
308
309    #region type-dependent dispatch
310    private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
311      return (list is List<string>) || // all tokens can be added to a string list
312             (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns
313             (tokenType == TokenTypeEnum.Double && list is List<double>) ||
314             (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
315    }
316
317    // all columns are converted to string columns when we find an non-empty value that has incorrect type
318    private IList ConvertToStringColumn(IList list) {
319      var dblL = list as List<double>;
320      if (dblL != null) {
321        var l = new List<string>(dblL.Capacity);
322        l.AddRange(dblL.Select(dbl => dbl.ToString()));
323        return l;
324      }
325
326      var dtL = list as List<DateTime>;
327      if (dtL != null) {
328        var l = new List<string>(dtL.Capacity);
329        l.AddRange(dtL.Select(dbl => dbl.ToString()));
330        return l;
331      }
332
333      if (list is List<string>) return list;
334
335      throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
336    }
337
338    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
339      var dblList = list as List<double>;
340      if (dblList != null) {
341        AddValue(type, dblList, dblVal);
342        return;
343      }
344
345      var strList = list as List<string>;
346      if (strList != null) {
347        AddValue(type, strList, strVal);
348        return;
349      }
350      var dtList = list as List<DateTime>;
351      if (dtList != null) {
352        AddValue(type, dtList, dateTimeVal);
353        return;
354      }
355
356      list.Add(strVal); // assumes List<object>
357    }
358
359    private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
360      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
361      list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
362    }
363
364    private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
365      // assumes that strVal is always set to the original token read from the input file
366      list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
367    }
368
369    private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
370      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
371      list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
372    }
373
374    private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
375      switch (type) {
376        case TokenTypeEnum.String:
377          return new List<string>(estimatedNumberOfLines);
378        case TokenTypeEnum.Double:
379        case TokenTypeEnum.Missing: // assume double columns
380          return new List<double>(estimatedNumberOfLines);
381        case TokenTypeEnum.DateTime:
382          return new List<DateTime>(estimatedNumberOfLines);
383        default:
384          throw new InvalidOperationException();
385      }
386    }
387    #endregion
388
389    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
390      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
391    }
392
393    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
394      using (StreamReader reader = new StreamReader(stream)) {
395        // skip first line
396        reader.ReadLine();
397        // read a block
398        char[] buffer = new char[BUFFER_SIZE];
399        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
400        // count frequency of special characters
401        Dictionary<char, int> charCounts = buffer.Take(charsRead)
402          .GroupBy(c => c)
403          .ToDictionary(g => g.Key, g => g.Count());
404
405        // depending on the characters occuring in the block
406        // we distinghish a number of different cases based on the the following rules:
407        // many points => it must be English number format, the other frequently occuring char is the separator
408        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
409        //   => check the line in more detail:
410        //            English: 0, 0, 0, 0
411        //            German:  0,0 0,0 0,0 ...
412        //            => if commas are followed by space => English format
413        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
414        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
415        if (OccurrencesOf(charCounts, '.') > 10) {
416          numberFormat = NumberFormatInfo.InvariantInfo;
417          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
418          separator = POSSIBLE_SEPARATORS
419            .Where(c => OccurrencesOf(charCounts, c) > 10)
420            .OrderBy(c => -OccurrencesOf(charCounts, c))
421            .DefaultIfEmpty(' ')
422            .First();
423        } else if (OccurrencesOf(charCounts, ',') > 10) {
424          // no points and many commas
425          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
426          int tokensWithMultipleCommas = 0;
427          for (int i = 0; i < charsRead; i++) {
428            int nCommas = 0;
429            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
430              if (buffer[i] == ',') nCommas++;
431              i++;
432            }
433            if (nCommas > 2) tokensWithMultipleCommas++;
434          }
435          if (tokensWithMultipleCommas > 1) {
436            // English format (only integer values) with ',' as separator
437            numberFormat = NumberFormatInfo.InvariantInfo;
438            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
439            separator = ',';
440          } else {
441            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
442            // German format (real values)
443            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
444            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
445            separator = POSSIBLE_SEPARATORS
446              .Except(disallowedSeparators)
447              .Where(c => OccurrencesOf(charCounts, c) > 10)
448              .OrderBy(c => -OccurrencesOf(charCounts, c))
449              .DefaultIfEmpty(' ')
450              .First();
451          }
452        } else {
453          // no points and no commas => English format
454          numberFormat = NumberFormatInfo.InvariantInfo;
455          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
456          separator = POSSIBLE_SEPARATORS
457            .Where(c => OccurrencesOf(charCounts, c) > 10)
458            .OrderBy(c => -OccurrencesOf(charCounts, c))
459            .DefaultIfEmpty(' ')
460            .First();
461        }
462      }
463    }
464
465    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
466      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
467    }
468
469    #region tokenizer
470    // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
471    internal enum TokenTypeEnum {
472      NewLine, String, Double, DateTime, Missing
473    }
474
475    internal class Tokenizer {
476      private StreamReader reader;
477      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
478      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
479      private string[] stringVals = new string[1024];
480      private double[] doubleVals = new double[1024];
481      private DateTime[] dateTimeVals = new DateTime[1024];
482      private int tokenPos;
483      private int numTokens;
484      private NumberFormatInfo numberFormatInfo;
485      private DateTimeFormatInfo dateTimeFormatInfo;
486      private char separator;
487
488      // arrays for string.Split()
489      private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
490      private readonly char[] separators;
491
492      private int currentLineNumber = 0;
493      public int CurrentLineNumber {
494        get { return currentLineNumber; }
495        private set { currentLineNumber = value; }
496      }
497      private string currentLine;
498      public string CurrentLine {
499        get { return currentLine; }
500        private set { currentLine = value; }
501      }
502      public long BytesRead {
503        get;
504        private set;
505      }
506
507      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
508        this.reader = reader;
509        this.numberFormatInfo = numberFormatInfo;
510        this.dateTimeFormatInfo = dateTimeFormatInfo;
511        this.separator = separator;
512        this.separators = new char[] { separator };
513        ReadNextTokens();
514      }
515
516      public bool HasNext() {
517        return numTokens > tokenPos || !reader.EndOfStream;
518      }
519
520      public TokenTypeEnum PeekType() {
521        return tokenTypes[tokenPos];
522      }
523
524      public void Skip() {
525        // simply skips one token without returning the result values
526        tokenPos++;
527        if (numTokens == tokenPos) {
528          ReadNextTokens();
529        }
530      }
531
532      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
533        type = tokenTypes[tokenPos];
534        strVal = stringVals[tokenPos];
535        dblVal = doubleVals[tokenPos];
536        dateTimeVal = dateTimeVals[tokenPos];
537        Skip();
538      }
539
540      private void ReadNextTokens() {
541        if (!reader.EndOfStream) {
542          CurrentLine = reader.ReadLine();
543          CurrentLineNumber++;
544          if (reader.BaseStream.CanSeek) {
545            BytesRead = reader.BaseStream.Position;
546          } else {
547            BytesRead += CurrentLine.Length + 2; // guess
548          }
549          int i = 0;
550          if (!string.IsNullOrWhiteSpace(CurrentLine)) {
551            foreach (var tok in Split(CurrentLine)) {
552              TokenTypeEnum type;
553              double doubleVal;
554              DateTime dateTimeValue;
555              type = TokenTypeEnum.String; // default
556              stringVals[i] = tok.Trim();
557              if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
558                type = TokenTypeEnum.Double;
559                doubleVals[i] = doubleVal;
560              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
561                && dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this
562                ) {
563                type = TokenTypeEnum.DateTime;
564                dateTimeVals[i] = dateTimeValue;
565              } else if (string.IsNullOrWhiteSpace(tok)) {
566                type = TokenTypeEnum.Missing;
567              }
568
569              // couldn't parse the token as an int or float number or datetime value so return a string token
570
571              tokenTypes[i] = type;
572              i++;
573
574              if (i >= tokenTypes.Length) {
575                // increase buffer size if necessary
576                IncreaseCapacity(ref tokenTypes);
577                IncreaseCapacity(ref doubleVals);
578                IncreaseCapacity(ref stringVals);
579                IncreaseCapacity(ref dateTimeVals);
580              }
581            }
582          }
583          tokenTypes[i] = TokenTypeEnum.NewLine;
584          numTokens = i + 1;
585          tokenPos = 0;
586        }
587      }
588
589      private IEnumerable<string> Split(string line) {
590        return separator == WHITESPACECHAR ?
591          line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
592          line.Split(separators);
593      }
594
595      private static void IncreaseCapacity<T>(ref T[] arr) {
596        int n = (int)Math.Floor(arr.Length * 1.7); // guess
597        T[] arr2 = new T[n];
598        Array.Copy(arr, arr2, arr.Length);
599        arr = arr2;
600      }
601    }
602    #endregion
603
604    #region parsing
605
606    private void ParseVariableNames() {
607      // the first line must contain variable names
608      List<string> varNames = new List<string>();
609
610      TokenTypeEnum type;
611      string strVal;
612      double dblVal;
613      DateTime dateTimeVal;
614
615      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
616
617      // the first token must be a variable name
618      if (type != TokenTypeEnum.String)
619        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
620      varNames.Add(strVal);
621
622      while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
623        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
624        varNames.Add(strVal);
625      }
626      ExpectType(TokenTypeEnum.NewLine);
627
628      variableNames = varNames;
629    }
630
631    private void ExpectType(TokenTypeEnum expectedToken) {
632      if (tokenizer.PeekType() != expectedToken)
633        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
634      tokenizer.Skip();
635    }
636
637    private void Error(string message, string token, int lineNumber) {
638      throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
639    }
640    #endregion
641  }
642}
Note: See TracBrowser for help on using the repository browser.