Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13445

Last change on this file since 13445 was 13445, checked in by gkronber, 9 years ago

#2071: corrected disposal of StreamReader

File size: 25.2 KB
RevLine 
[7849]1#region License Information
2/* HeuristicLab
[12012]3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22
23using System;
24using System.Collections;
25using System.Collections.Generic;
26using System.Globalization;
27using System.IO;
28using System.Linq;
[13442]29using System.Runtime;
[7849]30using System.Runtime.Serialization;
[13440]31using System.Text;
[7849]32
33namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13414]34  public class TableFileParser : Progress<long> { // reports the number of bytes read
[8564]35    private const int BUFFER_SIZE = 65536;
[9652]36    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
37    private const char WHITESPACECHAR = (char)0;
38    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
[7849]39    private Tokenizer tokenizer;
[13440]40    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
[7849]41
42    private int rows;
43    public int Rows {
44      get { return rows; }
45      set { rows = value; }
46    }
47
48    private int columns;
49    public int Columns {
50      get { return columns; }
51      set { columns = value; }
52    }
53
54    private List<IList> values;
55    public List<IList> Values {
56      get {
57        return values;
58      }
59    }
60
61    private List<string> variableNames;
62    public IEnumerable<string> VariableNames {
63      get {
64        if (variableNames.Count > 0) return variableNames;
65        else {
66          string[] names = new string[columns];
67          for (int i = 0; i < names.Length; i++) {
68            names[i] = "X" + i.ToString("000");
69          }
70          return names;
71        }
72      }
73    }
74
75    public TableFileParser() {
76      variableNames = new List<string>();
77    }
78
[9608]79    public bool AreColumnNamesInFirstLine(string fileName) {
80      NumberFormatInfo numberFormat;
81      DateTimeFormatInfo dateTimeFormatInfo;
82      char separator;
83      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
84      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
85        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
86      }
87    }
88
89    public bool AreColumnNamesInFirstLine(Stream stream) {
90      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
91      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
92      char separator = ',';
93      return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
94    }
95
96    public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
97                                         DateTimeFormatInfo dateTimeFormatInfo, char separator) {
98      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
99        return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
100      }
101    }
102
103    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
104                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
105      using (StreamReader reader = new StreamReader(stream)) {
106        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]107        return (tokenizer.PeekType() != TokenTypeEnum.Double);
[9608]108      }
109    }
110
[7851]111    /// <summary>
112    /// Parses a file and determines the format first
113    /// </summary>
114    /// <param name="fileName">file which is parsed</param>
[9608]115    /// <param name="columnNamesInFirstLine"></param>
[13413]116    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
[7851]117      NumberFormatInfo numberFormat;
118      DateTimeFormatInfo dateTimeFormatInfo;
119      char separator;
[9608]120      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
[13440]121      EstimateNumberOfLines(fileName);
[13413]122      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[7851]123    }
124
125    /// <summary>
126    /// Parses a file with the given formats
127    /// </summary>
128    /// <param name="fileName">file which is parsed</param>
129    /// <param name="numberFormat">Format of numbers</param>
130    /// <param name="dateTimeFormatInfo">Format of datetime</param>
131    /// <param name="separator">defines the separator</param>
[9608]132    /// <param name="columnNamesInFirstLine"></param>
[13413]133    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[13440]134      EstimateNumberOfLines(fileName);
[9608]135      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[13413]136        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[9608]137      }
[7849]138    }
139
[13440]140    // determines the number of newline characters in the first 64KB to guess the number of rows for a file
141    private void EstimateNumberOfLines(string fileName) {
142      var len = new System.IO.FileInfo(fileName).Length;
143      var buf = new char[64 * 1024];
[13445]144      using(var reader = new StreamReader(fileName)) {
145        reader.ReadBlock(buf, 0, buf.Length);
146      }
[13440]147      int numNewLine = 0;
[13442]148      int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
149      foreach (var ch in buf) {
150        charsInCurrentLine++;
151        if (ch == '\n') {
152          if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
153          charsInCurrentLine = 0;
154          numNewLine++;
155        }
156      }
157      if (numNewLine <= 1) {
[13440]158        // fail -> keep the default setting
159        return;
160      } else {
[13442]161        double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
[13440]162        double estimatedLines = len / charsPerLineFactor;
163        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
164      }
165    }
166
[7851]167    /// <summary>
168    /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
169    /// </summary>
170    /// <param name="stream">stream which is parsed</param>
[9608]171    /// <param name="columnNamesInFirstLine"></param>
[13413]172    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
[7851]173      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
174      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
175      char separator = ',';
[13413]176      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[7851]177    }
178
179    /// <summary>
180    /// Parses a stream with the given formats.
181    /// </summary>
182    /// <param name="stream">Stream which is parsed</param>   
183    /// <param name="numberFormat">Format of numbers</param>
184    /// <param name="dateTimeFormatInfo">Format of datetime</param>
185    /// <param name="separator">defines the separator</param>
[9608]186    /// <param name="columnNamesInFirstLine"></param>
[13413]187    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[7849]188      using (StreamReader reader = new StreamReader(stream)) {
189        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]190        // parse the file line by line
191        values = new List<IList>();
192        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
193        foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) {
194          columns = row.Count;
195          // on the first row we create our lists for column-oriented storage
196          if (!values.Any()) {
197            foreach (var obj in row) {
198              // create a list type matching the object type and add first element
199              if (obj == null) {
200                var l = new List<object>(estimatedNumberOfLines);
201                values.Add(l);
202                l.Add(obj);
203              } else if (obj is double) {
204                var l = new List<double>(estimatedNumberOfLines);
205                values.Add(l);
206                l.Add((double)obj);
207              } else if (obj is DateTime) {
208                var l = new List<DateTime>(estimatedNumberOfLines);
209                values.Add(l);
210                l.Add((DateTime)obj);
211              } else if (obj is string) {
212                var l = new List<string>(estimatedNumberOfLines);
213                values.Add(l);
214                l.Add((string)obj);
215              } else throw new InvalidOperationException();
216            }
217            // fill with initial value
218          } else {
219            // the columns are already there -> try to add values
220            int columnIndex = 0;
221            foreach (object element in row) {
222              if (values[columnIndex] is List<double> && !(element is double))
223                values[columnIndex].Add(double.NaN);
224              else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
225                values[columnIndex].Add(DateTime.MinValue);
226              else if (values[columnIndex] is List<string> && !(element is string))
227                values[columnIndex].Add(element.ToString());
228              else
229                values[columnIndex].Add(element);
230              columnIndex++;
231            }
232          }
[7849]233        }
234
[13440]235        if (!values.Any() || values.First().Count == 0)
236          Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[7849]237      }
238
[13441]239      this.rows = values.First().Count;
240
[13440]241      // after everything has been parsed make sure the lists are as compact as possible
242      foreach (var l in values) {
243        var dblList = l as List<double>;
244        var byteList = l as List<byte>;
245        var dateList = l as List<DateTime>;
246        var stringList = l as List<string>;
247        var objList = l as List<object>;
248        if (dblList != null) dblList.TrimExcess();
249        if (byteList != null) byteList.TrimExcess();
250        if (dateList != null) dateList.TrimExcess();
251        if (stringList != null) stringList.TrimExcess();
252        if (objList != null) objList.TrimExcess();
[7849]253      }
[13442]254
255      // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
256      GC.Collect(2, GCCollectionMode.Forced);
[7849]257    }
258
259    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
[8885]260      DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
[7849]261    }
262
263    public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
264      using (StreamReader reader = new StreamReader(stream)) {
265        // skip first line
266        reader.ReadLine();
267        // read a block
268        char[] buffer = new char[BUFFER_SIZE];
269        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
270        // count frequency of special characters
271        Dictionary<char, int> charCounts = buffer.Take(charsRead)
272          .GroupBy(c => c)
273          .ToDictionary(g => g.Key, g => g.Count());
274
275        // depending on the characters occuring in the block
276        // we distinghish a number of different cases based on the the following rules:
277        // many points => it must be English number format, the other frequently occuring char is the separator
278        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
279        //   => check the line in more detail:
280        //            English: 0, 0, 0, 0
281        //            German:  0,0 0,0 0,0 ...
282        //            => if commas are followed by space => English format
283        // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
284        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
285        if (OccurrencesOf(charCounts, '.') > 10) {
286          numberFormat = NumberFormatInfo.InvariantInfo;
287          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
288          separator = POSSIBLE_SEPARATORS
289            .Where(c => OccurrencesOf(charCounts, c) > 10)
290            .OrderBy(c => -OccurrencesOf(charCounts, c))
291            .DefaultIfEmpty(' ')
292            .First();
293        } else if (OccurrencesOf(charCounts, ',') > 10) {
294          // no points and many commas
295          // count the number of tokens (chains of only digits and commas) that contain multiple comma characters
296          int tokensWithMultipleCommas = 0;
297          for (int i = 0; i < charsRead; i++) {
298            int nCommas = 0;
299            while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
300              if (buffer[i] == ',') nCommas++;
301              i++;
302            }
303            if (nCommas > 2) tokensWithMultipleCommas++;
304          }
305          if (tokensWithMultipleCommas > 1) {
306            // English format (only integer values) with ',' as separator
307            numberFormat = NumberFormatInfo.InvariantInfo;
308            dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
309            separator = ',';
310          } else {
311            char[] disallowedSeparators = new char[] { ',' };
312            // German format (real values)
313            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
314            dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
315            separator = POSSIBLE_SEPARATORS
316              .Except(disallowedSeparators)
317              .Where(c => OccurrencesOf(charCounts, c) > 10)
318              .OrderBy(c => -OccurrencesOf(charCounts, c))
319              .DefaultIfEmpty(' ')
320              .First();
321          }
322        } else {
323          // no points and no commas => English format
324          numberFormat = NumberFormatInfo.InvariantInfo;
325          dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
326          separator = POSSIBLE_SEPARATORS
327            .Where(c => OccurrencesOf(charCounts, c) > 10)
328            .OrderBy(c => -OccurrencesOf(charCounts, c))
329            .DefaultIfEmpty(' ')
330            .First();
331        }
332      }
333    }
334
335    private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
336      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
337    }
338
339    #region tokenizer
340    internal enum TokenTypeEnum {
341      NewLine, Separator, String, Double, DateTime
342    }
343
344    internal class Tokenizer {
345      private StreamReader reader;
[13411]346      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
347      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
348      private string[] stringVals = new string[1024];
349      private double[] doubleVals = new double[1024];
350      private DateTime[] dateTimeVals = new DateTime[1024];
351      private int tokenPos;
352      private int numTokens;
[7849]353      private NumberFormatInfo numberFormatInfo;
354      private DateTimeFormatInfo dateTimeFormatInfo;
355      private char separator;
356      private const string INTERNAL_SEPARATOR = "#";
357
358      private int currentLineNumber = 0;
359      public int CurrentLineNumber {
360        get { return currentLineNumber; }
361        private set { currentLineNumber = value; }
362      }
363      private string currentLine;
364      public string CurrentLine {
365        get { return currentLine; }
366        private set { currentLine = value; }
367      }
[13414]368      public long BytesRead {
369        get;
370        private set;
371      }
[7849]372
373      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
374        this.reader = reader;
375        this.numberFormatInfo = numberFormatInfo;
376        this.dateTimeFormatInfo = dateTimeFormatInfo;
377        this.separator = separator;
378        ReadNextTokens();
379      }
380
381      private void ReadNextTokens() {
382        if (!reader.EndOfStream) {
383          CurrentLine = reader.ReadLine();
[13414]384          try {
385            BytesRead = reader.BaseStream.Position;
[13441]386          } catch (IOException) {
[13414]387            BytesRead += CurrentLine.Length + 2; // guess
[13441]388          } catch (NotSupportedException) {
[13414]389            BytesRead += CurrentLine.Length + 2;
390          }
[13411]391          int i = 0;
392          foreach (var tok in Split(CurrentLine)) {
393            var trimmedStr = tok.Trim();
394            if (!string.IsNullOrEmpty(trimmedStr)) {
395              TokenTypeEnum type = TokenTypeEnum.String; // default
396              stringVals[i] = trimmedStr;
397              double doubleVal;
398              DateTime dateTimeValue;
399              if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
400                type = TokenTypeEnum.Separator;
401              } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
402                type = TokenTypeEnum.Double;
403                doubleVals[i] = doubleVal;
404              } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
405                type = TokenTypeEnum.DateTime;
406                dateTimeVals[i] = dateTimeValue;
407              }
[7849]408
[13411]409              // couldn't parse the token as an int or float number  or datetime value so return a string token
410
411              tokenTypes[i] = type;
412              i++;
413
414              if (i >= tokenTypes.Length) {
415                // increase buffer size if necessary
416                IncreaseCapacity(ref tokenTypes);
417                IncreaseCapacity(ref doubleVals);
418                IncreaseCapacity(ref stringVals);
419                IncreaseCapacity(ref dateTimeVals);
420              }
421            }
422          }
423          tokenTypes[i] = TokenTypeEnum.NewLine;
424          numTokens = i + 1;
425          tokenPos = 0;
[7849]426        }
427      }
428
[13411]429      private static void IncreaseCapacity<T>(ref T[] arr) {
430        int n = (int)Math.Floor(arr.Length * 1.7); // guess
431        T[] arr2 = new T[n];
432        Array.Copy(arr, arr2, arr.Length);
433        arr = arr2;
434      }
435
[7849]436      private IEnumerable<string> Split(string line) {
[13411]437        string[] splitString;
[9652]438        if (separator == WHITESPACECHAR) {
439          //separate whitespaces
440          splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
441        } else {
442          splitString = line.Split(separator);
443        }
[7849]444
[13411]445        for (int i = 0; i < splitString.Length - 1; i++) {
446          yield return splitString[i];
447          yield return INTERNAL_SEPARATOR;
[7849]448        }
[13411]449        // do not return the INTERNAL_SEPARATOR after the last string
450        yield return splitString[splitString.Length - 1];
[7849]451      }
452
[13411]453      public TokenTypeEnum PeekType() {
454        return tokenTypes[tokenPos];
[7849]455      }
456
[13411]457      public void Skip() {
458        // simply skips one token without returning the result values
459        tokenPos++;
460        if (numTokens == tokenPos) {
[7849]461          ReadNextTokens();
462        }
463      }
464
[13411]465      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
466        type = tokenTypes[tokenPos];
467        strVal = stringVals[tokenPos];
468        dblVal = doubleVals[tokenPos];
469        dateTimeVal = dateTimeVals[tokenPos];
470        Skip();
471      }
472
[7849]473      public bool HasNext() {
[13411]474        return numTokens > tokenPos || !reader.EndOfStream;
[7849]475      }
476    }
477    #endregion
478
479    #region parsing
[13440]480    private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
[9608]481      if (columnNamesInFirstLine) {
482        ParseVariableNames();
483        if (!tokenizer.HasNext())
484          Error(
485            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
486            "", tokenizer.CurrentLineNumber);
487      }
[13440]488      return ParseValues(lineLimit);
[7849]489    }
490
[13440]491    private IEnumerable<List<object>> ParseValues(int lineLimit = -1) {
[13413]492      int nLinesParsed = 0;
[13440]493      int numValuesInFirstRow = -1;
[13413]494      while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
[13411]495        if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
496          tokenizer.Skip();
[13413]497          nLinesParsed++;
[7849]498        } else {
499          List<object> row = new List<object>();
500          object value = NextValue(tokenizer);
501          row.Add(value);
[13411]502          while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
503            ExpectType(TokenTypeEnum.Separator);
[7849]504            row.Add(NextValue(tokenizer));
505          }
[13411]506          ExpectType(TokenTypeEnum.NewLine);
[13413]507          nLinesParsed++;
[7849]508          // all rows have to have the same number of values           
509          // the first row defines how many samples are needed
[13440]510          if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count;
511          else if (numValuesInFirstRow != row.Count) {
512            Error("The first row of the dataset has " + numValuesInFirstRow + " columns." +
[7849]513                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
514                  tokenizer.CurrentLineNumber);
515          }
[13440]516          yield return row;
[7849]517        }
[13414]518
519        OnReport(tokenizer.BytesRead);
[7849]520      }
521    }
522
523    private object NextValue(Tokenizer tokenizer) {
[13411]524      if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
525      TokenTypeEnum type;
526      string strVal;
527      double dblVal;
528      DateTime dateTimeVal;
529
530      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
531      switch (type) {
532        case TokenTypeEnum.Separator: return double.NaN;
533        case TokenTypeEnum.String: return strVal;
534        case TokenTypeEnum.Double: return dblVal;
535        case TokenTypeEnum.DateTime: return dateTimeVal;
[7849]536      }
537      // found an unexpected token => throw error
[13411]538      Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
[7849]539      // this line is never executed because Error() throws an exception
540      throw new InvalidOperationException();
541    }
542
543    private void ParseVariableNames() {
544      // the first line must contain variable names
[13411]545      List<string> varNames = new List<string>();
546
547      TokenTypeEnum type;
548      string strVal;
549      double dblVal;
550      DateTime dateTimeVal;
551
552      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
553
554      // the first token must be a variable name
555      if (type != TokenTypeEnum.String)
556        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
557      varNames.Add(strVal);
558
559      while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
560        ExpectType(TokenTypeEnum.Separator);
561        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
562        varNames.Add(strVal);
[7849]563      }
[13411]564      ExpectType(TokenTypeEnum.NewLine);
565
566      variableNames = varNames;
[7849]567    }
568
[13411]569    private void ExpectType(TokenTypeEnum expectedToken) {
570      if (tokenizer.PeekType() != expectedToken)
571        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
572      tokenizer.Skip();
[7849]573    }
574
575    private void Error(string message, string token, int lineNumber) {
576      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
577    }
578    #endregion
579
580    [Serializable]
[9449]581    public class DataFormatException : Exception {
[7849]582      private int line;
583      public int Line {
584        get { return line; }
585      }
586      private string token;
587      public string Token {
588        get { return token; }
589      }
590      public DataFormatException(string message, string token, int line)
591        : base(message + "\nToken: " + token + " (line: " + line + ")") {
592        this.token = token;
593        this.line = line;
594      }
595
596      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
597    }
598  }
599}
Note: See TracBrowser for help on using the repository browser.