source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13526

Last change on this file since 13526 was 13526, checked in by gkronber, 5 years ago

#2071 added code for type conversion of columns to the table file parser and made some other minor changes

File size: 26.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Diagnostics.Contracts;
27using System.Globalization;
28using System.IO;
29using System.Linq;
30using System.Runtime;
31using System.Runtime.Serialization;
32using System.Text;
33
34namespace HeuristicLab.Problems.Instances.DataAnalysis {
35  public class TableFileParser : Progress<long> { // reports the number of bytes read
36    private const int BUFFER_SIZE = 65536;
37    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
38    private const char WHITESPACECHAR = (char)0;
39    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
40    private Tokenizer tokenizer;
41    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
42
43    private int rows;
44    public int Rows {
45      get { return rows; }
46      set { rows = value; }
47    }
48
49    private int columns;
50    public int Columns {
51      get { return columns; }
52      set { columns = value; }
53    }
54
55    private List<IList> values;
56    public List<IList> Values {
57      get {
58        return values;
59      }
60    }
61
62    private List<string> variableNames;
63    public IEnumerable<string> VariableNames {
64      get {
65        if (variableNames.Count > 0) return variableNames;
66        else {
67          string[] names = new string[columns];
68          for (int i = 0; i < names.Length; i++) {
69            names[i] = "X" + i.ToString("000");
70          }
71          return names;
72        }
73      }
74    }
75
76    public TableFileParser() {
77      variableNames = new List<string>();
78    }
79
80    public bool AreColumnNamesInFirstLine(string fileName) {
81      NumberFormatInfo numberFormat;
82      DateTimeFormatInfo dateTimeFormatInfo;
83      char separator;
84      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
85      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
86        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
87      }
88    }
89
90    public bool AreColumnNamesInFirstLine(Stream stream) {
91      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
92      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
93      char separator = ',';
94      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
95    }
96
97    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
98                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
99      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
100        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
101      }
102    }
103
104    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
105                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
106      using (StreamReader reader = new StreamReader(stream)) {
107        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
108        return (tokenizer.PeekType() != TokenTypeEnum.Double);
109      }
110    }
111
112    /// <summary>
113    /// Parses a file and determines the format first
114    /// </summary>
115    /// <param name="fileName">file which is parsed</param>
116    /// <param name="columnNamesInFirstLine"></param>
117    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
118      NumberFormatInfo numberFormat;
119      DateTimeFormatInfo dateTimeFormatInfo;
120      char separator;
121      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
122      EstimateNumberOfLines(fileName);
123      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
124    }
125
126    /// <summary>
127    /// Parses a file with the given formats
128    /// </summary>
129    /// <param name="fileName">file which is parsed</param>
130    /// <param name="numberFormat">Format of numbers</param>
131    /// <param name="dateTimeFormatInfo">Format of datetime</param>
132    /// <param name="separator">defines the separator</param>
133    /// <param name="columnNamesInFirstLine"></param>
134    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
135      EstimateNumberOfLines(fileName);
136      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
137        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
138      }
139    }
140
141    // determines the number of newline characters in the first 64KB to guess the number of rows for a file
142    private void EstimateNumberOfLines(string fileName) {
143      var len = new System.IO.FileInfo(fileName).Length;
144      var buf = new char[1024 * 1024];
145      using (var reader = new StreamReader(fileName)) {
146        reader.ReadBlock(buf, 0, buf.Length);
147      }
148      int numNewLine = 0;
149      int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
150      foreach (var ch in buf) {
151        charsInCurrentLine++;
152        if (ch == '\n') {
153          if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
154          charsInCurrentLine = 0;
155          numNewLine++;
156        }
157      }
158      if (numNewLine <= 1) {
159        // fail -> keep the default setting
160        return;
161      } else {
162        double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
163        double estimatedLines = len / charsPerLineFactor;
164        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
165      }
166    }
167
168    /// <summary>
169    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
170    /// </summary>
171    /// <param name="stream">stream which is parsed</param>
172    /// <param name="columnNamesInFirstLine"></param>
173    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
174      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
175      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
176      char separator = ',';
177      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
178    }
179
180    /// <summary>
181    /// Parses a stream with the given formats.
182    /// </summary>
183    /// <param name="stream">Stream which is parsed</param>   
184    /// <param name="numberFormat">Format of numbers</param>
185    /// <param name="dateTimeFormatInfo">Format of datetime</param>
186    /// <param name="separator">defines the separator</param>
187    /// <param name="columnNamesInFirstLine"></param>
188    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
189      using (StreamReader reader = new StreamReader(stream)) {
190        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
191        values = new List<IList>();
192        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
193
194        if (columnNamesInFirstLine) {
195          ParseVariableNames();
196          if (!tokenizer.HasNext())
197            Error(
198              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
199              "", tokenizer.CurrentLineNumber);
200        }
201
202
203        // read values... start in first row
204        int nLinesParsed = 0;
205        int colIdx = 0;
206        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
207        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
208          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
209            tokenizer.Skip();
210
211            // all rows have to have the same number of values
212            // the first row defines how many samples are needed
213            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
214            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
215              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
216                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
217                    tokenizer.CurrentLineNumber);
218            }
219            OnReport(tokenizer.BytesRead);
220
221            nLinesParsed++;
222            colIdx = 0;
223          } else {
224            // read one value
225            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
226            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
227
228            // initialize columns on the first row (fixing data types as presented in the first row...)
229            if (nLinesParsed == 0) {
230              values.Add(CreateList(type, estimatedNumberOfLines));
231            } else if (colIdx == values.Count) {
232              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
233                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
234                tokenizer.CurrentLineNumber);
235            }
236            if (!IsColumnTypeCompatible(values[colIdx], type)) {
237              values[colIdx] = ConvertToStringColumn(values[colIdx]);
238            }
239            // add the value to the column
240            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
241          }
242        }
243
244        if (!values.Any() || values.First().Count == 0)
245          Error("Couldn't parse data values. Probably because of incorrect number format " +
246                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
247      }
248
249      this.rows = values.First().Count;
250      this.columns = values.Count;
251
252      // after everything has been parsed make sure the lists are as compact as possible
253      foreach (var l in values) {
254        var dblList = l as List<double>;
255        var byteList = l as List<byte>;
256        var dateList = l as List<DateTime>;
257        var stringList = l as List<string>;
258        var objList = l as List<object>;
259        if (dblList != null) dblList.TrimExcess();
260        if (byteList != null) byteList.TrimExcess();
261        if (dateList != null) dateList.TrimExcess();
262        if (stringList != null) stringList.TrimExcess();
263        if (objList != null) objList.TrimExcess();
264      }
265
266      // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
267      GC.Collect(2, GCCollectionMode.Forced);
268    }
269
270    #region type-dependent dispatch
271    private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
272      return (list is List<string>) || // all tokens can be added to a string list
273             (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns
274             (tokenType == TokenTypeEnum.Double && list is List<double>) ||
275             (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
276    }
277
278    // all columns are converted to string columns when we find an non-empty value that has incorrect type
279    private IList ConvertToStringColumn(IList list) {
280      var dblL = list as List<double>;
281      if (dblL != null) {
282        var l = new List<string>(dblL.Capacity);
283        l.AddRange(dblL.Select(dbl => dbl.ToString()));
284        return l;
285      }
286
287      var dtL = list as List<DateTime>;
288      if (dtL != null) {
289        var l = new List<string>(dtL.Capacity);
290        l.AddRange(dtL.Select(dbl => dbl.ToString()));
291        return l;
292      }
293
294      if (list is List<string>) return list;
295
296      throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
297    }
298
299    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
300      var dblList = list as List<double>;
301      if (dblList != null) {
302        AddValue(type, dblList, dblVal);
303        return;
304      }
305
306      var strList = list as List<string>;
307      if (strList != null) {
308        AddValue(type, strList, strVal);
309        return;
310      }
311      var dtList = list as List<DateTime>;
312      if (dtList != null) {
313        AddValue(type, dtList, dateTimeVal);
314        return;
315      }
316
317      list.Add(strVal); // assumes List<object>
318    }
319
320    private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
321      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
322      list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
323    }
324
325    private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
326      // assumes that strVal is always set to the original token read from the input file
327      list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
328    }
329
330    private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
331      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
332      list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
333    }
334
335    private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
336      switch (type) {
337        case TokenTypeEnum.String:
338          return new List<string>(estimatedNumberOfLines);
339        case TokenTypeEnum.Double:
340        case TokenTypeEnum.Missing: // assume double columns
341          return new List<double>(estimatedNumberOfLines);
342        case TokenTypeEnum.DateTime:
343          return new List<DateTime>(estimatedNumberOfLines);
344        default:
345          throw new InvalidOperationException();
346      }
347    }
348    #endregion
349
350    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
351      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
352    }
353
354    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
355      using (StreamReader reader = new StreamReader(stream)) {
356        // skip first line
357        reader.ReadLine();
358        // read a block
359        char[] buffer = new char[BUFFER_SIZE];
360        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
361        // count frequency of special characters
362        Dictionary<char, int> charCounts = buffer.Take(charsRead)
363          .GroupBy(c => c)
364          .ToDictionary(g => g.Key, g => g.Count());
365
366        // depending on the characters occuring in the block
367        // we distinghish a number of different cases based on the the following rules:
368        // many points => it must be English number format, the other frequently occuring char is the separator
369        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
370        //   => check the line in more detail:
371        //            English: 0, 0, 0, 0
372        //            German:  0,0 0,0 0,0 ...
373        //            => if commas are followed by space => English format
374        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
375        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
376        if (OccurrencesOf(charCounts, '.') > 10) {
377          numberFormat = NumberFormatInfo.InvariantInfo;
378          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
379          separator = POSSIBLE_SEPARATORS
380            .Where(c => OccurrencesOf(charCounts, c) > 10)
381            .OrderBy(c => -OccurrencesOf(charCounts, c))
382            .DefaultIfEmpty(' ')
383            .First();
384        } else if (OccurrencesOf(charCounts, ',') > 10) {
385          // no points and many commas
386          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
387          int tokensWithMultipleCommas = 0;
388          for (int i = 0; i < charsRead; i++) {
389            int nCommas = 0;
390            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
391              if (buffer[i] == ',') nCommas++;
392              i++;
393            }
394            if (nCommas > 2) tokensWithMultipleCommas++;
395          }
396          if (tokensWithMultipleCommas > 1) {
397            // English format (only integer values) with ',' as separator
398            numberFormat = NumberFormatInfo.InvariantInfo;
399            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
400            separator = ',';
401          } else {
402            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
403            // German format (real values)
404            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
405            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
406            separator = POSSIBLE_SEPARATORS
407              .Except(disallowedSeparators)
408              .Where(c => OccurrencesOf(charCounts, c) > 10)
409              .OrderBy(c => -OccurrencesOf(charCounts, c))
410              .DefaultIfEmpty(' ')
411              .First();
412          }
413        } else {
414          // no points and no commas => English format
415          numberFormat = NumberFormatInfo.InvariantInfo;
416          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
417          separator = POSSIBLE_SEPARATORS
418            .Where(c => OccurrencesOf(charCounts, c) > 10)
419            .OrderBy(c => -OccurrencesOf(charCounts, c))
420            .DefaultIfEmpty(' ')
421            .First();
422        }
423      }
424    }
425
426    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
427      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
428    }
429
430    #region tokenizer
431    // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
432    internal enum TokenTypeEnum {
433      NewLine, String, Double, DateTime, Missing
434    }
435
436    internal class Tokenizer {
437      private StreamReader reader;
438      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
439      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
440      private string[] stringVals = new string[1024];
441      private double[] doubleVals = new double[1024];
442      private DateTime[] dateTimeVals = new DateTime[1024];
443      private int tokenPos;
444      private int numTokens;
445      private NumberFormatInfo numberFormatInfo;
446      private DateTimeFormatInfo dateTimeFormatInfo;
447      private char separator;
448
449      // arrays for string.Split()
450      private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
451      private readonly char[] separators;
452
453      private int currentLineNumber = 0;
454      public int CurrentLineNumber {
455        get { return currentLineNumber; }
456        private set { currentLineNumber = value; }
457      }
458      private string currentLine;
459      public string CurrentLine {
460        get { return currentLine; }
461        private set { currentLine = value; }
462      }
463      public long BytesRead {
464        get;
465        private set;
466      }
467
468      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
469        this.reader = reader;
470        this.numberFormatInfo = numberFormatInfo;
471        this.dateTimeFormatInfo = dateTimeFormatInfo;
472        this.separator = separator;
473        this.separators = new char[] { separator };
474        ReadNextTokens();
475      }
476
477      public bool HasNext() {
478        return numTokens > tokenPos || !reader.EndOfStream;
479      }
480
481      public TokenTypeEnum PeekType() {
482        return tokenTypes[tokenPos];
483      }
484
485      public void Skip() {
486        // simply skips one token without returning the result values
487        tokenPos++;
488        if (numTokens == tokenPos) {
489          ReadNextTokens();
490        }
491      }
492
493      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
494        type = tokenTypes[tokenPos];
495        strVal = stringVals[tokenPos];
496        dblVal = doubleVals[tokenPos];
497        dateTimeVal = dateTimeVals[tokenPos];
498        Skip();
499      }
500
501      private void ReadNextTokens() {
502        if (!reader.EndOfStream) {
503          CurrentLine = reader.ReadLine();
504          CurrentLineNumber++;
505          try {
506            BytesRead = reader.BaseStream.Position;
507          } catch (IOException) {
508            BytesRead += CurrentLine.Length + 2; // guess
509          } catch (NotSupportedException) {
510            BytesRead += CurrentLine.Length + 2;
511          }
512          int i = 0;
513          if (!string.IsNullOrWhiteSpace(CurrentLine)) {
514            foreach (var tok in Split(CurrentLine)) {
515              TokenTypeEnum type;
516              double doubleVal;
517              DateTime dateTimeValue;
518              type = TokenTypeEnum.String; // default
519              stringVals[i] = tok.Trim();
520              if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
521                type = TokenTypeEnum.Double;
522                doubleVals[i] = doubleVal;
523              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
524                type = TokenTypeEnum.DateTime;
525                dateTimeVals[i] = dateTimeValue;
526              } else if (string.IsNullOrWhiteSpace(tok)) {
527                type = TokenTypeEnum.Missing;
528              }
529
530              // couldn't parse the token as an int or float number or datetime value so return a string token
531
532              tokenTypes[i] = type;
533              i++;
534
535              if (i >= tokenTypes.Length) {
536                // increase buffer size if necessary
537                IncreaseCapacity(ref tokenTypes);
538                IncreaseCapacity(ref doubleVals);
539                IncreaseCapacity(ref stringVals);
540                IncreaseCapacity(ref dateTimeVals);
541              }
542            }
543          }
544          tokenTypes[i] = TokenTypeEnum.NewLine;
545          numTokens = i + 1;
546          tokenPos = 0;
547        }
548      }
549
550      private IEnumerable<string> Split(string line) {
551        return separator == WHITESPACECHAR ?
552          line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
553          line.Split(separators);
554      }
555
556      private static void IncreaseCapacity<T>(ref T[] arr) {
557        int n = (int)Math.Floor(arr.Length * 1.7); // guess
558        T[] arr2 = new T[n];
559        Array.Copy(arr, arr2, arr.Length);
560        arr = arr2;
561      }
562    }
563    #endregion
564
565    #region parsing
566
567    private void ParseVariableNames() {
568      // the first line must contain variable names
569      List<string> varNames = new List<string>();
570
571      TokenTypeEnum type;
572      string strVal;
573      double dblVal;
574      DateTime dateTimeVal;
575
576      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
577
578      // the first token must be a variable name
579      if (type != TokenTypeEnum.String)
580        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
581      varNames.Add(strVal);
582
583      while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
584        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
585        varNames.Add(strVal);
586      }
587      ExpectType(TokenTypeEnum.NewLine);
588
589      variableNames = varNames;
590    }
591
592    private void ExpectType(TokenTypeEnum expectedToken) {
593      if (tokenizer.PeekType() != expectedToken)
594        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
595      tokenizer.Skip();
596    }
597
598    private void Error(string message, string token, int lineNumber) {
599      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
600    }
601    #endregion
602
603    [Serializable]
604    public class DataFormatException : Exception {
605      private int line;
606      public int Line {
607        get { return line; }
608      }
609      private string token;
610      public string Token {
611        get { return token; }
612      }
613      public DataFormatException(string message, string token, int line)
614        : base(message + "\nToken: " + token + " (line: " + line + ")") {
615        this.token = token;
616        this.line = line;
617      }
618
619      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
620    }
621  }
622}
Note: See TracBrowser for help on using the repository browser.