Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
07/02/16 08:15:07 (8 years ago)
Author:
gkronber
Message:

#2071: merged r13411,r13413,r13414,r13415,r13419,r13440,r13441,r13442,r13445,r13447,r13525,r13526,r13529,r13584,r13901,r13925 from trunk to stable

Location:
stable
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • stable

  • stable/HeuristicLab.Problems.Instances.DataAnalysis

  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs

    r12009 r13974  
    2323using System.Collections;
    2424using System.Collections.Generic;
     25using System.ComponentModel;
    2526using System.Globalization;
    2627using System.IO;
     
    3536    where ImportType : DataAnalysisImportType {
    3637
     38    public event ProgressChangedEventHandler ProgressChanged;
    3739
    3840    public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) {
    3941      TableFileParser csvFileParser = new TableFileParser();
     42      long fileSize = new FileInfo(path).Length;
     43      csvFileParser.ProgressChanged += (sender, e) => {
     44        OnProgressChanged(e / (double)fileSize);
     45      };
    4046      csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator, csvFormat.VariableNamesAvailable);
    4147      return ImportData(path, type, csvFileParser);
     48    }
     49
     50    protected virtual void OnProgressChanged(double d) {
     51      var handler = ProgressChanged;
     52      if (handler != null)
     53        handler(this, new ProgressChangedEventArgs((int)(100 * d), null));
    4254    }
    4355
     
    89101        strBuilder.AppendLine();
    90102      }
    91 
    92       using (var writer = new StreamWriter(path)) {
    93         writer.Write(strBuilder);
     103      using (var fileStream = new FileStream(path, FileMode.Create)) {
     104        Encoding encoding = Encoding.GetEncoding(Encoding.Default.CodePage,
     105          new EncoderReplacementFallback("*"),
     106          new DecoderReplacementFallback("*"));
     107        using (var writer = new StreamWriter(fileStream, encoding)) {
     108          writer.Write(strBuilder);
     109        }
    94110      }
    95111    }
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r12009 r13974  
    2424using System.Collections;
    2525using System.Collections.Generic;
     26using System.Diagnostics.Contracts;
    2627using System.Globalization;
    2728using System.IO;
    2829using System.Linq;
    2930using System.Runtime.Serialization;
     31using System.Text;
    3032
    3133namespace HeuristicLab.Problems.Instances.DataAnalysis {
    32   public class TableFileParser {
     34  public class TableFileParser : Progress<long> { // reports the number of bytes read
    3335    private const int BUFFER_SIZE = 65536;
    3436    // char used to represent whitespace as a separator (missing values cannot be handled when whitespace is the separator)
     
    3638    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
    3739    private Tokenizer tokenizer;
    38     private List<List<object>> rowValues;
     40    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
     41
     42
     43    private Encoding encoding = Encoding.Default;
     44
     45    public Encoding Encoding {
     46      get { return encoding; }
     47      set {
     48        if (value == null) throw new ArgumentNullException("Encoding");
     49        encoding = value;
     50      }
     51    }
     52
    3953
    4054    private int rows;
     
    7286
    7387    public TableFileParser() {
    74       rowValues = new List<List<object>>();
    7588      variableNames = new List<string>();
    7689    }
     
    102115    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
    103116                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    104       using (StreamReader reader = new StreamReader(stream)) {
     117      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    105118        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    106         return tokenizer.Peek().type != TokenTypeEnum.Double;
     119        return (tokenizer.PeekType() != TokenTypeEnum.Double);
    107120      }
    108121    }
     
    113126    /// <param name="fileName">file which is parsed</param>
    114127    /// <param name="columnNamesInFirstLine"></param>
    115     public void Parse(string fileName, bool columnNamesInFirstLine) {
     128    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
    116129      NumberFormatInfo numberFormat;
    117130      DateTimeFormatInfo dateTimeFormatInfo;
    118131      char separator;
    119132      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
    120       Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     133      EstimateNumberOfLines(fileName);
     134      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
    121135    }
    122136
     
    129143    /// <param name="separator">defines the separator</param>
    130144    /// <param name="columnNamesInFirstLine"></param>
    131     public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
     145    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     146      EstimateNumberOfLines(fileName);
    132147      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    133         Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     148        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
     149      }
     150    }
     151
     152    // counts the newline characters in the first 1024 * 1024 characters to guess the number of rows in a file
     153    private void EstimateNumberOfLines(string fileName) {
     154      var len = new System.IO.FileInfo(fileName).Length;
     155      var buf = new char[1024 * 1024];
     156      using (var reader = new StreamReader(fileName, Encoding)) {
     157        reader.ReadBlock(buf, 0, buf.Length);
     158      }
     159      int numNewLine = 0;
     160      int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
     161      foreach (var ch in buf) {
     162        charsInCurrentLine++;
     163        if (ch == '\n') {
     164          if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
     165          charsInCurrentLine = 0;
     166          numNewLine++;
     167        }
     168      }
     169      if (numNewLine <= 1) {
     170        // fail -> keep the default setting
     171        return;
     172      } else {
     173        double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
     174        double estimatedLines = len / charsPerLineFactor;
     175        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
    134176      }
    135177    }
     
    140182    /// <param name="stream">stream which is parsed</param>
    141183    /// <param name="columnNamesInFirstLine"></param>
    142     public void Parse(Stream stream, bool columnNamesInFirstLine) {
     184    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
    143185      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
    144186      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    145187      char separator = ',';
    146       Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     188      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
    147189    }
    148190
     
    155197    /// <param name="separator">defines the separator</param>
    156198    /// <param name="columnNamesInFirstLine"></param>
    157     public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
    158       using (StreamReader reader = new StreamReader(stream)) {
     199    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     200      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    159201        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    160         // parse the file
    161         Parse(columnNamesInFirstLine);
    162       }
    163 
    164       // translate the list of samples into a DoubleMatrixData item
    165       rows = rowValues.Count;
    166       columns = rowValues[0].Count;
    167       values = new List<IList>();
    168 
    169       //create columns
    170       for (int col = 0; col < columns; col++) {
    171         var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
    172         if (!types.Any()) {
    173           values.Add(new List<string>());
    174           continue;
     202        values = new List<IList>();
     203        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
     204
     205        if (columnNamesInFirstLine) {
     206          ParseVariableNames();
     207          if (!tokenizer.HasNext())
     208            Error(
     209              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
     210              "", tokenizer.CurrentLineNumber);
    175211        }
    176212
    177         var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
    178         if (columnType == typeof(double)) values.Add(new List<double>());
    179         else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
    180         else if (columnType == typeof(string)) values.Add(new List<string>());
    181         else throw new InvalidOperationException();
    182       }
    183 
    184 
    185 
    186       //fill with values
    187       foreach (List<object> row in rowValues) {
    188         int columnIndex = 0;
    189         foreach (object element in row) {
    190           if (values[columnIndex] is List<double> && !(element is double))
    191             values[columnIndex].Add(double.NaN);
    192           else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
    193             values[columnIndex].Add(DateTime.MinValue);
    194           else if (values[columnIndex] is List<string> && !(element is string))
    195             values[columnIndex].Add(element.ToString());
    196           else
    197             values[columnIndex].Add(element);
    198           columnIndex++;
     213
     214        // read values... start in first row
     215        int nLinesParsed = 0;
     216        int colIdx = 0;
     217        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or initialize based on first row of values (-1)
     218        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
     219          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
     220            tokenizer.Skip();
     221
     222            // all rows have to have the same number of values
     223            // the first row defines how many samples are needed
     224            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of columns in the first row
     225            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
     226              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     227                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
     228                    tokenizer.CurrentLineNumber);
     229            }
     230            OnReport(tokenizer.BytesRead);
     231
     232            nLinesParsed++;
     233            colIdx = 0;
     234          } else {
     235            // read one value
     236            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
     237            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     238
     239            // initialize columns on the first row (fixing data types as presented in the first row...)
     240            if (nLinesParsed == 0) {
     241              values.Add(CreateList(type, estimatedNumberOfLines));
     242            } else if (colIdx == values.Count) {
     243              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     244                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
     245                tokenizer.CurrentLineNumber);
     246            }
     247            if (!IsColumnTypeCompatible(values[colIdx], type)) {
     248              values[colIdx] = ConvertToStringColumn(values[colIdx]);
     249            }
     250            // add the value to the column
     251            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
     252          }
    199253        }
    200       }
    201     }
     254
     255        if (!values.Any() || values.First().Count == 0)
     256          Error("Couldn't parse data values. Probably because of incorrect number format " +
     257                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
     258      }
     259
     260      this.rows = values.First().Count;
     261      this.columns = values.Count;
     262
     263      // after everything has been parsed make sure the lists are as compact as possible
     264      foreach (var l in values) {
     265        var dblList = l as List<double>;
     266        var byteList = l as List<byte>;
     267        var dateList = l as List<DateTime>;
     268        var stringList = l as List<string>;
     269        var objList = l as List<object>;
     270        if (dblList != null) dblList.TrimExcess();
     271        if (byteList != null) byteList.TrimExcess();
     272        if (dateList != null) dateList.TrimExcess();
     273        if (stringList != null) stringList.TrimExcess();
     274        if (objList != null) objList.TrimExcess();
     275      }
     276
     277      // parsing large files creates a lot of memory pressure, so it cannot hurt to run GC.Collect here (TableFileParser is called rarely, on user interaction)
     278      GC.Collect(2, GCCollectionMode.Forced);
     279    }
     280
     281    #region type-dependent dispatch
     282    private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
     283      return (list is List<string>) || // all tokens can be added to a string list
     284             (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns
     285             (tokenType == TokenTypeEnum.Double && list is List<double>) ||
     286             (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
     287    }
     288
     289    // a column is converted to a string column when we find a non-empty value whose type is incompatible with the column
     290    private IList ConvertToStringColumn(IList list) {
     291      var dblL = list as List<double>;
     292      if (dblL != null) {
     293        var l = new List<string>(dblL.Capacity);
     294        l.AddRange(dblL.Select(dbl => dbl.ToString()));
     295        return l;
     296      }
     297
     298      var dtL = list as List<DateTime>;
     299      if (dtL != null) {
     300        var l = new List<string>(dtL.Capacity);
     301        l.AddRange(dtL.Select(dbl => dbl.ToString()));
     302        return l;
     303      }
     304
     305      if (list is List<string>) return list;
     306
     307      throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
     308    }
     309
     310    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
     311      var dblList = list as List<double>;
     312      if (dblList != null) {
     313        AddValue(type, dblList, dblVal);
     314        return;
     315      }
     316
     317      var strList = list as List<string>;
     318      if (strList != null) {
     319        AddValue(type, strList, strVal);
     320        return;
     321      }
     322      var dtList = list as List<DateTime>;
     323      if (dtList != null) {
     324        AddValue(type, dtList, dateTimeVal);
     325        return;
     326      }
     327
     328      list.Add(strVal); // assumes List<object>
     329    }
     330
     331    private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
     332      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
     333      list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
     334    }
     335
     336    private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
     337      // assumes that strVal is always set to the original token read from the input file
     338      list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
     339    }
     340
     341    private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
     342      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
     343      list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
     344    }
     345
     346    private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
     347      switch (type) {
     348        case TokenTypeEnum.String:
     349          return new List<string>(estimatedNumberOfLines);
     350        case TokenTypeEnum.Double:
     351        case TokenTypeEnum.Missing: // assume double columns
     352          return new List<double>(estimatedNumberOfLines);
     353        case TokenTypeEnum.DateTime:
     354          return new List<DateTime>(estimatedNumberOfLines);
     355        default:
     356          throw new InvalidOperationException();
     357      }
     358    }
     359    #endregion
    202360
    203361    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
     
    253411            separator = ',';
    254412          } else {
    255             char[] disallowedSeparators = new char[] { ',' };
     413            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed too; however, existing unit tests would fail
    256414            // German format (real values)
    257415            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
     
    282440
    283441    #region tokenizer
     442    // the tokenizer reads full lines and returns the separated tokens of each line followed by a terminating NewLine token
    284443    internal enum TokenTypeEnum {
    285       NewLine, Separator, String, Double, DateTime
    286     }
    287 
    288     internal class Token {
    289       public TokenTypeEnum type;
    290       public string stringValue;
    291       public double doubleValue;
    292       public DateTime dateTimeValue;
    293 
    294       public Token(TokenTypeEnum type, string value) {
    295         this.type = type;
    296         stringValue = value;
    297         dateTimeValue = DateTime.MinValue;
    298         doubleValue = 0.0;
    299       }
    300 
    301       public override string ToString() {
    302         return stringValue;
    303       }
    304     }
    305 
     444      NewLine, String, Double, DateTime, Missing
     445    }
    306446
    307447    internal class Tokenizer {
    308448      private StreamReader reader;
    309       private List<Token> tokens;
     449      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
     450      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
     451      private string[] stringVals = new string[1024];
     452      private double[] doubleVals = new double[1024];
     453      private DateTime[] dateTimeVals = new DateTime[1024];
     454      private int tokenPos;
     455      private int numTokens;
    310456      private NumberFormatInfo numberFormatInfo;
    311457      private DateTimeFormatInfo dateTimeFormatInfo;
    312458      private char separator;
    313       private const string INTERNAL_SEPARATOR = "#";
     459
     460      // arrays for string.Split()
     461      private readonly char[] whiteSpaceSeparators = new char[0]; // string.Split() falls back to white-space characters as separators when given an empty array
     462      private readonly char[] separators;
    314463
    315464      private int currentLineNumber = 0;
     
    323472        private set { currentLine = value; }
    324473      }
    325 
    326       private Token newlineToken;
    327       public Token NewlineToken {
    328         get { return newlineToken; }
    329         private set { newlineToken = value; }
    330       }
    331       private Token separatorToken;
    332       public Token SeparatorToken {
    333         get { return separatorToken; }
    334         private set { separatorToken = value; }
     474      public long BytesRead {
     475        get;
     476        private set;
    335477      }
    336478
     
    340482        this.dateTimeFormatInfo = dateTimeFormatInfo;
    341483        this.separator = separator;
    342         separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    343         newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    344         tokens = new List<Token>();
     484        this.separators = new char[] { separator };
    345485        ReadNextTokens();
     486      }
     487
     488      public bool HasNext() {
     489        return numTokens > tokenPos || !reader.EndOfStream;
     490      }
     491
     492      public TokenTypeEnum PeekType() {
     493        return tokenTypes[tokenPos];
     494      }
     495
     496      public void Skip() {
     497        // simply skips one token without returning the result values
     498        tokenPos++;
     499        if (numTokens == tokenPos) {
     500          ReadNextTokens();
     501        }
     502      }
     503
     504      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
     505        type = tokenTypes[tokenPos];
     506        strVal = stringVals[tokenPos];
     507        dblVal = doubleVals[tokenPos];
     508        dateTimeVal = dateTimeVals[tokenPos];
     509        Skip();
    346510      }
    347511
     
    349513        if (!reader.EndOfStream) {
    350514          CurrentLine = reader.ReadLine();
    351           var newTokens = from str in Split(CurrentLine)
    352                           let trimmedStr = str.Trim()
    353                           where !string.IsNullOrEmpty(trimmedStr)
    354                           select MakeToken(trimmedStr);
    355 
    356           tokens.AddRange(newTokens);
    357           tokens.Add(NewlineToken);
    358515          CurrentLineNumber++;
     516          if (reader.BaseStream.CanSeek) {
     517            BytesRead = reader.BaseStream.Position;
     518          } else {
     519            BytesRead += CurrentLine.Length + 2; // guess
     520          }
     521          int i = 0;
     522          if (!string.IsNullOrWhiteSpace(CurrentLine)) {
     523            foreach (var tok in Split(CurrentLine)) {
     524              TokenTypeEnum type;
     525              double doubleVal;
     526              DateTime dateTimeValue;
     527              type = TokenTypeEnum.String; // default
     528              stringVals[i] = tok.Trim();
     529              if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
     530                type = TokenTypeEnum.Double;
     531                doubleVals[i] = doubleVal;
     532              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
     533                type = TokenTypeEnum.DateTime;
     534                dateTimeVals[i] = dateTimeValue;
     535              } else if (string.IsNullOrWhiteSpace(tok)) {
     536                type = TokenTypeEnum.Missing;
     537              }
     538
     539              // a token that is neither a number, a date/time value nor empty keeps the default String type
     540
     541              tokenTypes[i] = type;
     542              i++;
     543
     544              if (i >= tokenTypes.Length) {
     545                // increase buffer size if necessary
     546                IncreaseCapacity(ref tokenTypes);
     547                IncreaseCapacity(ref doubleVals);
     548                IncreaseCapacity(ref stringVals);
     549                IncreaseCapacity(ref dateTimeVals);
     550              }
     551            }
     552          }
     553          tokenTypes[i] = TokenTypeEnum.NewLine;
     554          numTokens = i + 1;
     555          tokenPos = 0;
    359556        }
    360557      }
    361558
    362559      private IEnumerable<string> Split(string line) {
    363         IEnumerable<string> splitString;
    364         if (separator == WHITESPACECHAR) {
    365           //separate whitespaces
    366           splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    367         } else {
    368           splitString = line.Split(separator);
    369         }
    370         int cur = splitString.Count();
    371         foreach (var str in splitString) {
    372           yield return str;
    373           cur--;
    374           // do not return the INTERNAL_SEPARATOR after the last string
    375           if (cur != 0) {
    376             yield return INTERNAL_SEPARATOR;
    377           }
    378         }
    379       }
    380 
    381       private Token MakeToken(string strToken) {
    382         Token token = new Token(TokenTypeEnum.String, strToken);
    383         if (strToken.Equals(INTERNAL_SEPARATOR)) {
    384           return SeparatorToken;
    385         } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
    386           token.type = TokenTypeEnum.Double;
    387           return token;
    388         } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
    389           token.type = TokenTypeEnum.DateTime;
    390           return token;
    391         }
    392 
    393         // couldn't parse the token as an int or float number  or datetime value so return a string token
    394         return token;
    395       }
    396 
    397       public Token Peek() {
    398         return tokens[0];
    399       }
    400 
    401       public Token Next() {
    402         Token next = tokens[0];
    403         tokens.RemoveAt(0);
    404         if (tokens.Count == 0) {
    405           ReadNextTokens();
    406         }
    407         return next;
    408       }
    409 
    410       public bool HasNext() {
    411         return tokens.Count > 0 || !reader.EndOfStream;
     560        return separator == WHITESPACECHAR ?
     561          line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
     562          line.Split(separators);
     563      }
     564
     565      private static void IncreaseCapacity<T>(ref T[] arr) {
     566        int n = (int)Math.Floor(arr.Length * 1.7); // guess
     567        T[] arr2 = new T[n];
     568        Array.Copy(arr, arr2, arr.Length);
     569        arr = arr2;
    412570      }
    413571    }
     
    415573
    416574    #region parsing
    417     private void Parse(bool columnNamesInFirstLine) {
    418       if (columnNamesInFirstLine) {
    419         ParseVariableNames();
    420         if (!tokenizer.HasNext())
    421           Error(
    422             "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    423             "", tokenizer.CurrentLineNumber);
    424       }
    425       ParseValues();
    426       if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    427     }
    428 
    429     private void ParseValues() {
    430       while (tokenizer.HasNext()) {
    431         if (tokenizer.Peek() == tokenizer.NewlineToken) {
    432           tokenizer.Next();
    433         } else {
    434           List<object> row = new List<object>();
    435           object value = NextValue(tokenizer);
    436           row.Add(value);
    437           while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    438             Expect(tokenizer.SeparatorToken);
    439             row.Add(NextValue(tokenizer));
    440           }
    441           Expect(tokenizer.NewlineToken);
    442           // all rows have to have the same number of values           
    443           // the first row defines how many samples are needed
    444           if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
    445             Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
    446                   "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
    447                   tokenizer.CurrentLineNumber);
    448           }
    449           rowValues.Add(row);
    450         }
    451       }
    452     }
    453 
    454     private object NextValue(Tokenizer tokenizer) {
    455       if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
    456       Token current = tokenizer.Next();
    457       if (current.type == TokenTypeEnum.Separator) {
    458         return double.NaN;
    459       } else if (current.type == TokenTypeEnum.String) {
    460         return current.stringValue;
    461       } else if (current.type == TokenTypeEnum.Double) {
    462         return current.doubleValue;
    463       } else if (current.type == TokenTypeEnum.DateTime) {
    464         return current.dateTimeValue;
    465       }
    466       // found an unexpected token => throw error
    467       Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
    468       // this line is never executed because Error() throws an exception
    469       throw new InvalidOperationException();
    470     }
    471575
    472576    private void ParseVariableNames() {
    473577      // the first line must contain variable names
    474       List<Token> tokens = new List<Token>();
    475       Token valueToken;
    476       valueToken = tokenizer.Next();
    477       tokens.Add(valueToken);
    478       while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    479         Expect(tokenizer.SeparatorToken);
    480         valueToken = tokenizer.Next();
    481         if (valueToken != tokenizer.NewlineToken) {
    482           tokens.Add(valueToken);
    483         }
    484       }
    485       if (valueToken != tokenizer.NewlineToken) {
    486         Expect(tokenizer.NewlineToken);
    487       }
    488       variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
    489     }
    490 
    491     private void Expect(Token expectedToken) {
    492       Token actualToken = tokenizer.Next();
    493       if (actualToken != expectedToken) {
    494         Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
    495       }
     578      List<string> varNames = new List<string>();
     579
     580      TokenTypeEnum type;
     581      string strVal;
     582      double dblVal;
     583      DateTime dateTimeVal;
     584
     585      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     586
     587      // the first token must be a variable name
     588      if (type != TokenTypeEnum.String)
     589        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
     590      varNames.Add(strVal);
     591
     592      while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
     593        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     594        varNames.Add(strVal);
     595      }
     596      ExpectType(TokenTypeEnum.NewLine);
     597
     598      variableNames = varNames;
     599    }
     600
     601    private void ExpectType(TokenTypeEnum expectedToken) {
     602      if (tokenizer.PeekType() != expectedToken)
     603        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
     604      tokenizer.Skip();
    496605    }
    497606
Note: See TracChangeset for help on using the changeset viewer.