Free cookie consent management tool by TermsFeed Policy Generator

source: branches/crossvalidation-2434/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 15318

Last change on this file since 15318 was 14029, checked in by gkronber, 9 years ago

#2434: merged trunk changes r12934:14026 from trunk to branch

File size: 26.4 KB
RevLine 
[7849]1#region License Information
2/* HeuristicLab
[12012]3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
[14029]26using System.Diagnostics.Contracts;
[7849]27using System.Globalization;
28using System.IO;
29using System.Linq;
30using System.Runtime.Serialization;
[14029]31using System.Text;
[7849]32
33namespace HeuristicLab.Problems.Instances.DataAnalysis {
[14029]34  public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of characters that DetermineFileFormat samples from a file to guess its format
private const int BUFFER_SIZE = 65536;
// placeholder separator that stands for "split on whitespace"
// (missing values cannot be represented when whitespace is the separator)
private const char WHITESPACECHAR = '\0';
// separator candidates considered by DetermineFileFormat
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
// tokenizer created by the most recent Parse / AreColumnNamesInFirstLine call
private Tokenizer tokenizer;
// initial capacity of the column lists; replaced by an estimate when a file is parsed
// or by the line limit when one is given
private int estimatedNumberOfLines = 200;
[7849]41
[14029]42
private Encoding encoding = Encoding.Default;

/// <summary>
/// Text encoding that is passed to the stream readers created while parsing.
/// Initialized with <see cref="System.Text.Encoding.Default"/>.
/// </summary>
/// <exception cref="ArgumentNullException">Thrown when the property is set to null.</exception>
public Encoding Encoding {
  get { return encoding; }
  set {
    // the implicit parameter of a property setter is called "value"
    if (value == null) throw new ArgumentNullException("value");
    encoding = value;
  }
}
52
53
private int rows;
/// <summary>
/// Number of rows; Parse sets it to the length of the first column.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
59
private int columns;
/// <summary>
/// Number of columns; Parse sets it to the number of column lists.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
65
private List<IList> values;
/// <summary>
/// The parsed data, one list per column. The field is assigned by Parse and is
/// not initialized by the constructor.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
72
private List<string> variableNames;
/// <summary>
/// Column names. When no names have been read, generic names "X000", "X001", ...
/// are generated, one for each of the <see cref="Columns"/> columns.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) return variableNames;

    // no names available -> synthesize one name per column
    var generated = new string[columns];
    int idx = 0;
    while (idx < generated.Length) {
      generated[idx] = "X" + idx.ToString("000");
      idx++;
    }
    return generated;
  }
}
86
/// <summary>
/// Creates a parser with an empty list of column names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
}
90
/// <summary>
/// Determines the format of the file and then checks whether its first line
/// contains column names instead of numeric values.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token of the file is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);

  bool namesInFirstLine;
  using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    namesInFirstLine = AreColumnNamesInFirstLine(file, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
  }
  return namesInFirstLine;
}
100
/// <summary>
/// Checks whether the first line of the stream contains column names, using the
/// default format: invariant number and date formats and ',' as separator.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <returns>true if the first token of the stream is not a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
107
/// <summary>
/// Opens the file and checks whether its first line contains column names,
/// using the given formats.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token of the file is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
                                      DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  bool namesInFirstLine;
  using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    namesInFirstLine = AreColumnNamesInFirstLine(file, numberFormat, dateTimeFormatInfo, separator);
  }
  return namesInFirstLine;
}
114
/// <summary>
/// Tokenizes the first line of the stream with the given formats and checks
/// whether it starts with something other than a number.
/// The reader that wraps the stream is disposed before the method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
                                      DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (var textReader = new StreamReader(stream, Encoding)) {
    // the tokenizer reads and classifies the first line in its constructor
    tokenizer = new Tokenizer(textReader, numberFormat, dateTimeFormatInfo, separator);
    TokenTypeEnum firstTokenType = tokenizer.PeekType();
    return firstTokenType != TokenTypeEnum.Double;
  }
}
122
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file contains the column names</param>
/// <param name="lineLimit">maximum number of lines that are read; a negative value means no limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // the explicit-format overload estimates the number of lines and opens the file in a using block,
  // so the file handle is released on every path
  Parse(fileName, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
}
136
/// <summary>
/// Parses a file with the given formats. The number of lines is estimated
/// from the file before it is opened for parsing.
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file contains the column names</param>
/// <param name="lineLimit">maximum number of lines that are read; a negative value means no limit</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
  EstimateNumberOfLines(fileName);

  var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  using (file) {
    Parse(file, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
151
// Guesses the number of rows of a file: counts the newline characters in the first
// 1M characters (or in the whole file if it is shorter), derives the average line length
// and extrapolates it to the file size. The file size is measured in bytes while the sample
// is measured in characters, so the result is only an approximation.
// If no estimate can be made the current setting is kept.
private void EstimateNumberOfLines(string fileName) {
  long fileLength = new System.IO.FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }

  int numNewLine = 0;
  int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
  // only look at the characters that have actually been read
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }

  // fail -> keep the default setting
  if (numNewLine <= 1) return;

  // average length of the complete lines after the first line
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  double estimatedLines = fileLength / charsPerLineFactor;
  // pessimistic allocation of 110% to make sure that the list is very likely large enough
  double pessimisticEstimate = Math.Round(estimatedLines * 1.1);

  // an estimate that does not fit into an int cannot be used as list capacity -> keep the current setting
  if (pessimisticEstimate > int.MaxValue) return;
  estimatedNumberOfLines = (int)pessimisticEstimate;
}
178
/// <summary>
/// Parses a stream with the default format: NumberFormatInfo.InvariantInfo,
/// DateTimeFormatInfo.InvariantInfo and ',' as separator.
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream contains the column names</param>
/// <param name="lineLimit">maximum number of lines that are read; a negative value means no limit</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
190
/// <summary>
/// Parses a stream with the given formats. The result is available through
/// <see cref="Values"/>, <see cref="Rows"/>, <see cref="Columns"/> and <see cref="VariableNames"/>.
/// The column types are taken from the first data row; a column is converted to a string column
/// when a later value does not fit its type. Empty lines are skipped.
/// The reader that wraps the stream is disposed when parsing ends.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream contains the column names</param>
/// <param name="lineLimit">maximum number of lines that are read (empty lines count as well); a negative value means no limit</param>
/// <exception cref="DataFormatException">Thrown when no values can be read or when a row has a different number of values than the first row.</exception>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
  using (StreamReader reader = new StreamReader(stream, Encoding)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    values = new List<IList>();
    if (lineLimit > 0) estimatedNumberOfLines = lineLimit;

    if (columnNamesInFirstLine) {
      ParseVariableNames();
      if (!tokenizer.HasNext())
        Error(
          "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
          "", tokenizer.CurrentLineNumber);
    }

    // read values... start in first row
    int nLinesParsed = 0;
    int colIdx = 0;
    int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or initialize based on first row of values (-1)
    // columns are created while the first non-empty row is read;
    // empty lines in front of that row must not fix the set of columns
    bool firstDataRowRead = false;
    while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
      if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
        // skipping the end-of-line token makes the tokenizer read the next line,
        // therefore the number of the line that has just been completed is stored first
        int completedLineNumber = tokenizer.CurrentLineNumber;
        tokenizer.Skip();

        // all rows have to have the same number of values
        // the first row defines how many samples are needed
        if (colIdx > 0) { // read at least one value in the row (support for skipping empty lines)
          if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of columns in the first row
          else if (numValuesInFirstRow != colIdx) {
            Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
                  "Line " + completedLineNumber + " has " + colIdx + " columns.", "",
                  completedLineNumber);
          }
          firstDataRowRead = true;
        }
        OnReport(tokenizer.BytesRead);

        nLinesParsed++;
        colIdx = 0;
      } else {
        // read one value
        TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);

        // initialize columns on the first row (fixing data types as presented in the first row...)
        if (!firstDataRowRead) {
          values.Add(CreateList(type, estimatedNumberOfLines));
        } else if (colIdx == values.Count) {
          Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
                "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
            tokenizer.CurrentLineNumber);
        }
        if (!IsColumnTypeCompatible(values[colIdx], type)) {
          values[colIdx] = ConvertToStringColumn(values[colIdx]);
        }
        // add the value to the column
        AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
      }
    }

    if (!values.Any() || values.First().Count == 0)
      Error("Couldn't parse data values. Probably because of incorrect number format " +
            "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
  }

  this.rows = values.First().Count;
  this.columns = values.Count;

  // after everything has been parsed make sure the lists are as compact as possible
  foreach (var l in values) {
    var dblList = l as List<double>;
    var byteList = l as List<byte>;
    var dateList = l as List<DateTime>;
    var stringList = l as List<string>;
    var objList = l as List<object>;
    if (dblList != null) dblList.TrimExcess();
    if (byteList != null) byteList.TrimExcess();
    if (dateList != null) dateList.TrimExcess();
    if (stringList != null) stringList.TrimExcess();
    if (objList != null) objList.TrimExcess();
  }

  // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
  GC.Collect(2, GCCollectionMode.Forced);
}
280
281    #region type-dependent dispatch
// Checks whether a token of the given type can be stored in the given column list
// without converting the column.
private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
  // all tokens can be added to a string list
  if (list is List<string>) return true;

  switch (tokenType) {
    case TokenTypeEnum.Missing:
      // empty entries are allowed in all columns
      return true;
    case TokenTypeEnum.Double:
      return list is List<double>;
    case TokenTypeEnum.DateTime:
      return list is List<DateTime>;
    default:
      return false;
  }
}
288
// Produces a string column from a double or DateTime column by formatting every element
// with ToString(); the new list gets the capacity of the old one.
// String columns are returned unchanged, any other list type is rejected.
// Used when a non-empty value of an incompatible type is found for a column.
private IList ConvertToStringColumn(IList list) {
  var doubles = list as List<double>;
  if (doubles != null) {
    var converted = new List<string>(doubles.Capacity);
    foreach (double number in doubles) {
      converted.Add(number.ToString());
    }
    return converted;
  }

  var dates = list as List<DateTime>;
  if (dates != null) {
    var converted = new List<string>(dates.Capacity);
    foreach (DateTime date in dates) {
      converted.Add(date.ToString());
    }
    return converted;
  }

  if (!(list is List<string>)) {
    throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
  }
  return list;
}
309
// Appends one token to a column. The element type of the list decides which of the
// three parsed representations (double, string, DateTime) is stored.
private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
  var doubleColumn = list as List<double>;
  var stringColumn = list as List<string>;
  var dateTimeColumn = list as List<DateTime>;

  if (doubleColumn != null) {
    AddValue(type, doubleColumn, dblVal);
  } else if (stringColumn != null) {
    AddValue(type, stringColumn, strVal);
  } else if (dateTimeColumn != null) {
    AddValue(type, dateTimeColumn, dateTimeVal);
  } else {
    list.Add(strVal); // assumes List<object>
  }
}
330
// Appends a value to a double column; missing values are stored as NaN.
private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
  Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
  if (type == TokenTypeEnum.Missing) {
    list.Add(double.NaN);
  } else {
    list.Add(dblVal);
  }
}
335
// Appends a value to a string column; missing values are stored as the empty string.
// assumes that strVal is always set to the original token read from the input file
private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
  if (type == TokenTypeEnum.Missing) {
    list.Add(string.Empty);
  } else {
    list.Add(strVal);
  }
}
340
// Appends a value to a DateTime column; missing values are stored as DateTime.MinValue.
private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
  Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
  if (type == TokenTypeEnum.Missing) {
    list.Add(DateTime.MinValue);
  } else {
    list.Add(dtVal);
  }
}
345
// Creates an empty column list with the given initial capacity whose element type
// matches the token type. Any other token type is rejected.
private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
  if (type == TokenTypeEnum.String) {
    return new List<string>(estimatedNumberOfLines);
  }
  // a missing value in the first row: assume double columns
  if (type == TokenTypeEnum.Double || type == TokenTypeEnum.Missing) {
    return new List<double>(estimatedNumberOfLines);
  }
  if (type == TokenTypeEnum.DateTime) {
    return new List<DateTime>(estimatedNumberOfLines);
  }
  throw new InvalidOperationException();
}
359    #endregion
360
/// <summary>
/// Opens the file and guesses its number format, date format and separator.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">the detected format of numbers</param>
/// <param name="dateTimeFormatInfo">the detected format of datetime</param>
/// <param name="separator">the detected separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // the file is opened in a using block like in the other methods of this class that open files,
  // so the handle is released even if the stream overload fails before it takes ownership
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
364
/// <summary>
/// Guesses number format, date format and separator from a sample of the stream.
/// The first line is skipped, then up to BUFFER_SIZE characters are read and the
/// frequencies of the characters in this sample are evaluated:
///   more than 10 points              => English format, separator is the most frequent candidate
///   no points but more than 10 commas => ambiguous, either German format (real numbers) or
///                                        English format (only integers) with ',' as separator;
///                                        English is chosen if more than one run of digits and
///                                        commas contains more than two commas
///   otherwise                        => English format (only integers), separator is the most frequent candidate
/// ' ' is only returned as separator if no other candidate occurs more than 10 times
/// (spaces can also occur additionally to separators).
/// The stream is read with the default settings of StreamReader (not with the Encoding
/// of a parser instance) and the reader is disposed before the method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">the detected format of numbers</param>
/// <param name="dateTimeFormatInfo">the detected format of datetime</param>
/// <param name="separator">the detected separator</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // the first line might contain names and is not used for the detection
    reader.ReadLine();
    char[] sample = new char[BUFFER_SIZE];
    int sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);

    // frequency of every character in the sample
    var charCounts = new Dictionary<char, int>();
    for (int k = 0; k < sampleLength; k++) {
      int seen;
      charCounts.TryGetValue(sample[k], out seen);
      charCounts[sample[k]] = seen + 1;
    }

    bool manyPoints = OccurrencesOf(charCounts, '.') > 10;
    bool manyCommas = OccurrencesOf(charCounts, ',') > 10;
    bool germanFormat = !manyPoints && manyCommas &&
                        CountTokensWithMoreThanTwoCommas(sample, sampleLength) <= 1;

    if (germanFormat) {
      // German format (real values), ',' is the decimal separator and cannot separate values
      // (n. def. contains a space so ' ' should be disallowed too, however existing unit tests would fail)
      var german = new CultureInfo("de-DE");
      numberFormat = NumberFormatInfo.GetInstance(german);
      dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
    } else {
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      if (!manyPoints && manyCommas) {
        // English format (only integer values) with ',' as separator
        separator = ',';
      } else {
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      }
    }
  }
}

// Returns the candidate that occurs most often among those occurring more than 10 times;
// ties are resolved by the order of the candidates. Returns ' ' if no candidate qualifies.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}

// Counts the maximal runs of digits and commas within the first 'length' characters
// that contain more than two commas.
private static int CountTokensWithMoreThanTwoCommas(char[] buffer, int length) {
  int matchingTokens = 0;
  int commasInToken = 0;
  for (int i = 0; i < length; i++) {
    char ch = buffer[i];
    if (ch == ',') {
      commasInToken++;
    } else if (!Char.IsDigit(ch)) {
      // any other character terminates the current run
      if (commasInToken > 2) matchingTokens++;
      commasInToken = 0;
    }
  }
  // a run that reaches the end of the sample
  if (commasInToken > 2) matchingTokens++;
  return matchingTokens;
}
436
// Number of occurrences recorded for the character, 0 if the character has no entry.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int count;
  if (charCounts.TryGetValue(c, out count)) {
    return count;
  }
  return 0;
}
440
441    #region tokenizer
// Types of the tokens produced by the tokenizer. The tokenizer reads full lines and returns
// the separated tokens of a line followed by a terminating NewLine token.
// NewLine has the value 0 and therefore is the type found in unused slots of the token buffer.
internal enum TokenTypeEnum {
  NewLine = 0,
  String = 1,
  Double = 2,
  DateTime = 3,
  Missing = 4
}
446
// Splits the lines of a reader into typed tokens. A whole line is read and classified at once,
// its tokens are buffered and handed out one by one, followed by a NewLine token.
internal class Tokenizer {
  private StreamReader reader;
  // parallel buffers: slot i describes the i-th token of the current line
  // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffers grow if necessary)
  private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
  private string[] stringVals = new string[1024];
  private double[] doubleVals = new double[1024];
  private DateTime[] dateTimeVals = new DateTime[1024];
  // index of the token that is returned next
  private int tokenPos;
  // number of buffered tokens of the current line, including the terminating NewLine token
  private int numTokens;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;

  // arrays for string.Split()
  // an empty separator array makes string.Split use white-space characters as separators
  private readonly char[] whiteSpaceSeparators = new char[0];
  private readonly char[] separators;

  // number of lines read so far (the first line has number 1)
  public int CurrentLineNumber { get; private set; }

  // text of the line that has been read last
  public string CurrentLine { get; private set; }

  // position of the underlying stream if it can seek, otherwise a guess based on the line lengths
  public long BytesRead { get; private set; }

  // Creates the tokenizer and immediately reads and classifies the first line.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    this.separators = new char[] { separator };
    ReadNextTokens();
  }

  // true if buffered tokens are left or the reader has not reached the end of the stream
  public bool HasNext() {
    if (tokenPos < numTokens) return true;
    return !reader.EndOfStream;
  }

  // type of the next token, the token is not consumed
  public TokenTypeEnum PeekType() {
    return tokenTypes[tokenPos];
  }

  // simply skips one token without returning the result values;
  // the next line is read as soon as the last token of the current line has been consumed
  public void Skip() {
    tokenPos++;
    if (tokenPos == numTokens) {
      ReadNextTokens();
    }
  }

  // Returns type and values of the next token and consumes it.
  // Only the value that belongs to the type has been set for this token,
  // strVal always contains the trimmed text of the token.
  public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
    int pos = tokenPos;
    type = tokenTypes[pos];
    strVal = stringVals[pos];
    dblVal = doubleVals[pos];
    dateTimeVal = dateTimeVals[pos];
    Skip();
  }

  // Reads the next line, classifies its tokens and refills the buffers.
  // Does nothing when the end of the stream has been reached.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    CurrentLineNumber++;
    if (reader.BaseStream.CanSeek) {
      BytesRead = reader.BaseStream.Position;
    } else {
      BytesRead += CurrentLine.Length + 2; // guess
    }

    int count = 0;
    // a blank line produces only the NewLine token
    if (!string.IsNullOrWhiteSpace(CurrentLine)) {
      foreach (var tok in Split(CurrentLine)) {
        stringVals[count] = tok.Trim();

        // try number first, then date; tokens that are neither are missing (blank) or plain strings
        double parsedDouble;
        DateTime parsedDateTime;
        TokenTypeEnum tokenType;
        if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out parsedDouble)) {
          tokenType = TokenTypeEnum.Double;
          doubleVals[count] = parsedDouble;
        } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out parsedDateTime)) {
          tokenType = TokenTypeEnum.DateTime;
          dateTimeVals[count] = parsedDateTime;
        } else if (string.IsNullOrWhiteSpace(tok)) {
          tokenType = TokenTypeEnum.Missing;
        } else {
          tokenType = TokenTypeEnum.String;
        }
        tokenTypes[count] = tokenType;
        count++;

        // increase buffer size if necessary, there must always be room for one more token
        if (count >= tokenTypes.Length) {
          IncreaseCapacity(ref tokenTypes);
          IncreaseCapacity(ref doubleVals);
          IncreaseCapacity(ref stringVals);
          IncreaseCapacity(ref dateTimeVals);
        }
      }
    }

    // every line is terminated by a NewLine token
    tokenTypes[count] = TokenTypeEnum.NewLine;
    numTokens = count + 1;
    tokenPos = 0;
  }

  // Splits a line at the separator. For the whitespace placeholder the line is split at
  // white-space characters and empty entries are dropped, otherwise empty entries are kept.
  private IEnumerable<string> Split(string line) {
    if (separator == WHITESPACECHAR) {
      return line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries);
    }
    return line.Split(separators);
  }

  // Replaces the array by a copy that is 1.7 times as long (rounded down).
  private static void IncreaseCapacity<T>(ref T[] arr) {
    int enlargedLength = (int)Math.Floor(arr.Length * 1.7); // guess
    Array.Resize(ref arr, enlargedLength);
  }
}
[14029]572    #endregion
[7849]573
[14029]574    #region parsing
575
// Reads the tokens of the current line as variable names and consumes the end of the line.
// The first token has to be a string token, the remaining names are taken as they are.
// Throws an ArgumentException if the first token is not a string.
private void ParseVariableNames() {
  TokenTypeEnum tokenType;
  string name;
  double ignoredDouble;
  DateTime ignoredDateTime;

  // the first token must be a variable name
  tokenizer.Next(out tokenType, out name, out ignoredDouble, out ignoredDateTime);
  if (tokenType != TokenTypeEnum.String) {
    throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + tokenType);
  }

  var names = new List<string> { name };
  while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
    tokenizer.Next(out tokenType, out name, out ignoredDouble, out ignoredDateTime);
    names.Add(name);
  }
  ExpectType(TokenTypeEnum.NewLine);

  variableNames = names;
}
600
// Consumes the next token if it has the expected type, otherwise throws an ArgumentException.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actualToken = tokenizer.PeekType();
  if (actualToken != expectedToken) {
    throw new ArgumentException("Error: Expected " + expectedToken + " got " + actualToken);
  }
  tokenizer.Skip();
}
606
// Reports a parse error by throwing a DataFormatException for the given token and line.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
610    #endregion
611
/// <summary>
/// Exception that describes a parse error together with the token and the line it refers to.
/// </summary>
[Serializable]
public class DataFormatException : Exception {
  // names of the serialization entries of this class
  private const string LineEntryName = "DataFormatException.Line";
  private const string TokenEntryName = "DataFormatException.Token";

  private int line;
  /// <summary>Line number passed to the constructor.</summary>
  public int Line {
    get { return line; }
  }
  private string token;
  /// <summary>Token passed to the constructor.</summary>
  public string Token {
    get { return token; }
  }

  /// <summary>
  /// Creates the exception; token and line are appended to the message.
  /// </summary>
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.token = token;
    this.line = line;
  }

  /// <summary>
  /// Deserialization constructor. Restores <see cref="Token"/> and <see cref="Line"/> if the
  /// serialized data contains them; data without these entries leaves the default values.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) {
    foreach (SerializationEntry entry in info) {
      if (entry.Name == TokenEntryName) {
        token = entry.Value as string;
      } else if (entry.Name == LineEntryName) {
        line = Convert.ToInt32(entry.Value, CultureInfo.InvariantCulture);
      }
    }
  }

  /// <summary>
  /// Adds <see cref="Token"/> and <see cref="Line"/> to the serialized data of the exception.
  /// </summary>
  public override void GetObjectData(SerializationInfo info, StreamingContext context) {
    base.GetObjectData(info, context);
    info.AddValue(TokenEntryName, token);
    info.AddValue(LineEntryName, line);
  }
}
630  }
631}
Note: See TracBrowser for help on using the repository browser.