Changeset 13974 for stable/HeuristicLab.Problems.Instances.DataAnalysis
- Timestamp:
- 07/02/16 08:15:07 (8 years ago)
- Location:
- stable
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
stable
- Property svn:mergeinfo changed
/trunk/sources merged: 13411,13413-13415,13419,13440-13442,13445,13447,13525-13526,13529,13584,13901,13925
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis merged: 13411,13413-13414,13440-13442,13445,13447,13526,13584,13901,13925
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs
r12009 r13974 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.ComponentModel; 25 26 using System.Globalization; 26 27 using System.IO; … … 35 36 where ImportType : DataAnalysisImportType { 36 37 38 public event ProgressChangedEventHandler ProgressChanged; 37 39 38 40 public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) { 39 41 TableFileParser csvFileParser = new TableFileParser(); 42 long fileSize = new FileInfo(path).Length; 43 csvFileParser.ProgressChanged += (sender, e) => { 44 OnProgressChanged(e / (double)fileSize); 45 }; 40 46 csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator, csvFormat.VariableNamesAvailable); 41 47 return ImportData(path, type, csvFileParser); 48 } 49 50 protected virtual void OnProgressChanged(double d) { 51 var handler = ProgressChanged; 52 if (handler != null) 53 handler(this, new ProgressChangedEventArgs((int)(100 * d), null)); 42 54 } 43 55 … … 89 101 strBuilder.AppendLine(); 90 102 } 91 92 using (var writer = new StreamWriter(path)) { 93 writer.Write(strBuilder); 103 using (var fileStream = new FileStream(path, FileMode.Create)) { 104 Encoding encoding = Encoding.GetEncoding(Encoding.Default.CodePage, 105 new EncoderReplacementFallback("*"), 106 new DecoderReplacementFallback("*")); 107 using (var writer = new StreamWriter(fileStream, encoding)) { 108 writer.Write(strBuilder); 109 } 94 110 } 95 111 } -
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r12009 r13974 24 24 using System.Collections; 25 25 using System.Collections.Generic; 26 using System.Diagnostics.Contracts; 26 27 using System.Globalization; 27 28 using System.IO; 28 29 using System.Linq; 29 30 using System.Runtime.Serialization; 31 using System.Text; 30 32 31 33 namespace HeuristicLab.Problems.Instances.DataAnalysis { 32 public class TableFileParser {34 public class TableFileParser : Progress<long> { // reports the number of bytes read 33 35 private const int BUFFER_SIZE = 65536; 34 36 // char used to symbolize whitespaces (no missing values can be handled with whitespaces) … … 36 38 private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR }; 37 39 private Tokenizer tokenizer; 38 private List<List<object>> rowValues; 40 private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file 41 42 43 private Encoding encoding = Encoding.Default; 44 45 public Encoding Encoding { 46 get { return encoding; } 47 set { 48 if (value == null) throw new ArgumentNullException("Encoding"); 49 encoding = value; 50 } 51 } 52 39 53 40 54 private int rows; … … 72 86 73 87 public TableFileParser() { 74 rowValues = new List<List<object>>();75 88 variableNames = new List<string>(); 76 89 } … … 102 115 public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat, 103 116 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 104 using (StreamReader reader = new StreamReader(stream )) {117 using (StreamReader reader = new StreamReader(stream, Encoding)) { 105 118 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 106 return tokenizer.Peek().type != TokenTypeEnum.Double;119 return (tokenizer.PeekType() != TokenTypeEnum.Double); 107 120 } 108 121 } … … 113 126 /// <param name="fileName">file which is parsed</param> 114 127 /// <param name="columnNamesInFirstLine"></param> 115 public void Parse(string fileName, bool columnNamesInFirstLine ) {128 public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) { 116 129 NumberFormatInfo numberFormat; 117 130 DateTimeFormatInfo dateTimeFormatInfo; 118 131 char separator; 119 132 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 120 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine); 133 EstimateNumberOfLines(fileName); 134 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 121 135 } 122 136 … … 129 143 /// <param name="separator">defines the separator</param> 130 144 /// <param name="columnNamesInFirstLine"></param> 131 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) { 145 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 146 EstimateNumberOfLines(fileName); 132 147 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 133 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine); 148 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 149 } 150 } 151 152 // determines the number of newline characters in the first 64KB to guess the number of rows for a file 153 private void EstimateNumberOfLines(string fileName) { 154 var len = new System.IO.FileInfo(fileName).Length; 155 var buf = new char[1024 * 1024]; 156 using (var reader = new StreamReader(fileName, Encoding)) { 157 reader.ReadBlock(buf, 0, buf.Length); 158 } 159 int numNewLine = 0; 160 int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative 161 foreach (var ch in buf) { 162 charsInCurrentLine++; 163 if (ch == '\n') { 164 if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line 165 charsInCurrentLine = 0; 166 numNewLine++; 167 } 168 } 169 if (numNewLine <= 1) { 170 // fail -> keep the default setting 171 return; 172 } else { 173 double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1); 174 double estimatedLines = len / charsPerLineFactor; 175 estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough 134 176 } 135 177 } … … 140 182 /// <param name="stream">stream which is parsed</param> 141 183 /// <param name="columnNamesInFirstLine"></param> 142 public void Parse(Stream stream, bool columnNamesInFirstLine ) {184 public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) { 143 185 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 144 186 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 145 187 char separator = ','; 146 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine );188 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 147 189 } 148 190 … … 155 197 /// <param name="separator">defines the separator</param> 156 198 /// <param name="columnNamesInFirstLine"></param> 157 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine ) {158 using (StreamReader reader = new StreamReader(stream )) {199 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 200 using (StreamReader reader = new StreamReader(stream, Encoding)) { 159 201 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 160 // parse the file 161 Parse(columnNamesInFirstLine); 162 } 163 164 // translate the list of samples into a DoubleMatrixData item 165 rows = rowValues.Count; 166 columns = rowValues[0].Count; 167 values = new List<IList>(); 168 169 //create columns 170 for (int col = 0; col < columns; col++) { 171 var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType()); 172 if (!types.Any()) { 173 values.Add(new List<string>()); 174 continue; 202 values = new List<IList>(); 203 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 204 205 if (columnNamesInFirstLine) { 206 ParseVariableNames(); 207 if (!tokenizer.HasNext()) 208 Error( 209 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 210 "", tokenizer.CurrentLineNumber); 175 211 } 176 212 177 var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key; 178 if (columnType == typeof(double)) values.Add(new List<double>()); 179 else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>()); 180 else if (columnType == typeof(string)) values.Add(new List<string>()); 181 else throw new InvalidOperationException(); 182 } 183 184 185 186 //fill with values 187 foreach (List<object> row in rowValues) { 188 int columnIndex = 0; 189 foreach (object element in row) { 190 if (values[columnIndex] is List<double> && !(element is double)) 191 values[columnIndex].Add(double.NaN); 192 else if (values[columnIndex] is List<DateTime> && !(element is DateTime)) 193 values[columnIndex].Add(DateTime.MinValue); 194 else if (values[columnIndex] is List<string> && !(element is string)) 195 values[columnIndex].Add(element.ToString()); 196 else 197 values[columnIndex].Add(element); 198 columnIndex++; 213 214 // read values... start in first row 215 int nLinesParsed = 0; 216 int colIdx = 0; 217 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1) 218 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 219 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { 220 tokenizer.Skip(); 221 222 // all rows have to have the same number of values 223 // the first row defines how many samples are needed 224 if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row 225 else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines) 226 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 227 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 228 tokenizer.CurrentLineNumber); 229 } 230 OnReport(tokenizer.BytesRead); 231 232 nLinesParsed++; 233 colIdx = 0; 234 } else { 235 // read one value 236 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 237 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 238 239 // initialize columns on the first row (fixing data types as presented in the first row...) 240 if (nLinesParsed == 0) { 241 values.Add(CreateList(type, estimatedNumberOfLines)); 242 } else if (colIdx == values.Count) { 243 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 244 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 245 tokenizer.CurrentLineNumber); 246 } 247 if (!IsColumnTypeCompatible(values[colIdx], type)) { 248 values[colIdx] = ConvertToStringColumn(values[colIdx]); 249 } 250 // add the value to the column 251 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 252 } 199 253 } 200 } 201 } 254 255 if (!values.Any() || values.First().Count == 0) 256 Error("Couldn't parse data values. Probably because of incorrect number format " + 257 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 258 } 259 260 this.rows = values.First().Count; 261 this.columns = values.Count; 262 263 // after everything has been parsed make sure the lists are as compact as possible 264 foreach (var l in values) { 265 var dblList = l as List<double>; 266 var byteList = l as List<byte>; 267 var dateList = l as List<DateTime>; 268 var stringList = l as List<string>; 269 var objList = l as List<object>; 270 if (dblList != null) dblList.TrimExcess(); 271 if (byteList != null) byteList.TrimExcess(); 272 if (dateList != null) dateList.TrimExcess(); 273 if (stringList != null) stringList.TrimExcess(); 274 if (objList != null) objList.TrimExcess(); 275 } 276 277 // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) 278 GC.Collect(2, GCCollectionMode.Forced); 279 } 280 281 #region type-dependent dispatch 282 private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) { 283 return (list is List<string>) || // all tokens can be added to a string list 284 (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns 285 (tokenType == TokenTypeEnum.Double && list is List<double>) || 286 (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>); 287 } 288 289 // all columns are converted to string columns when we find an non-empty value that has incorrect type 290 private IList ConvertToStringColumn(IList list) { 291 var dblL = list as List<double>; 292 if (dblL != null) { 293 var l = new List<string>(dblL.Capacity); 294 l.AddRange(dblL.Select(dbl => dbl.ToString())); 295 return l; 296 } 297 298 var dtL = list as List<DateTime>; 299 if (dtL != null) { 300 var l = new List<string>(dtL.Capacity); 301 l.AddRange(dtL.Select(dbl => dbl.ToString())); 302 return l; 303 } 304 305 if (list is List<string>) return list; 306 307 throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType())); 308 } 309 310 private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) { 311 var dblList = list as List<double>; 312 if (dblList != null) { 313 AddValue(type, dblList, dblVal); 314 return; 315 } 316 317 var strList = list as List<string>; 318 if (strList != null) { 319 AddValue(type, strList, strVal); 320 return; 321 } 322 var dtList = list as List<DateTime>; 323 if (dtList != null) { 324 AddValue(type, dtList, dateTimeVal); 325 return; 326 } 327 328 list.Add(strVal); // assumes List<object> 329 } 330 331 private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) { 332 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double); 333 list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal); 334 } 335 336 private void AddValue(TokenTypeEnum type, List<string> list, string strVal) { 337 // assumes that strVal is always set to the original token read from the input file 338 list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal); 339 } 340 341 private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) { 342 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime); 343 list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal); 344 } 345 346 private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) { 347 switch (type) { 348 case TokenTypeEnum.String: 349 return new List<string>(estimatedNumberOfLines); 350 case TokenTypeEnum.Double: 351 case TokenTypeEnum.Missing: // assume double columns 352 return new List<double>(estimatedNumberOfLines); 353 case TokenTypeEnum.DateTime: 354 return new List<DateTime>(estimatedNumberOfLines); 355 default: 356 throw new InvalidOperationException(); 357 } 358 } 359 #endregion 202 360 203 361 public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) { … … 253 411 separator = ','; 254 412 } else { 255 char[] disallowedSeparators = new char[] { ',' }; 413 char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail 256 414 // German format (real values) 257 415 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); … … 282 440 283 441 #region tokenizer 442 // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character 284 443 internal enum TokenTypeEnum { 285 NewLine, Separator, String, Double, DateTime 286 } 287 288 internal class Token { 289 public TokenTypeEnum type; 290 public string stringValue; 291 public double doubleValue; 292 public DateTime dateTimeValue; 293 294 public Token(TokenTypeEnum type, string value) { 295 this.type = type; 296 stringValue = value; 297 dateTimeValue = DateTime.MinValue; 298 doubleValue = 0.0; 299 } 300 301 public override string ToString() { 302 return stringValue; 303 } 304 } 305 444 NewLine, String, Double, DateTime, Missing 445 } 306 446 307 447 internal class Tokenizer { 308 448 private StreamReader reader; 309 private List<Token> tokens; 449 // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary) 450 private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024]; 451 private string[] stringVals = new string[1024]; 452 private double[] doubleVals = new double[1024]; 453 private DateTime[] dateTimeVals = new DateTime[1024]; 454 private int tokenPos; 455 private int numTokens; 310 456 private NumberFormatInfo numberFormatInfo; 311 457 private DateTimeFormatInfo dateTimeFormatInfo; 312 458 private char separator; 313 private const string INTERNAL_SEPARATOR = "#"; 459 460 // arrays for string.Split() 461 private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default 462 private readonly char[] separators; 314 463 315 464 private int currentLineNumber = 0; … … 323 472 private set { currentLine = value; } 324 473 } 325 326 private Token newlineToken; 327 public Token NewlineToken { 328 get { return newlineToken; } 329 private set { newlineToken = value; } 330 } 331 private Token separatorToken; 332 public Token SeparatorToken { 333 get { return separatorToken; } 334 private set { separatorToken = value; } 474 public long BytesRead { 475 get; 476 private set; 335 477 } 336 478 … … 340 482 this.dateTimeFormatInfo = dateTimeFormatInfo; 341 483 this.separator = separator; 342 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); 343 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 344 tokens = new List<Token>(); 484 this.separators = new char[] { separator }; 345 485 ReadNextTokens(); 486 } 487 488 public bool HasNext() { 489 return numTokens > tokenPos || !reader.EndOfStream; 490 } 491 492 public TokenTypeEnum PeekType() { 493 return tokenTypes[tokenPos]; 494 } 495 496 public void Skip() { 497 // simply skips one token without returning the result values 498 tokenPos++; 499 if (numTokens == tokenPos) { 500 ReadNextTokens(); 501 } 502 } 503 504 public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) { 505 type = tokenTypes[tokenPos]; 506 strVal = stringVals[tokenPos]; 507 dblVal = doubleVals[tokenPos]; 508 dateTimeVal = dateTimeVals[tokenPos]; 509 Skip(); 346 510 } 347 511 … … 349 513 if (!reader.EndOfStream) { 350 514 CurrentLine = reader.ReadLine(); 351 var newTokens = from str in Split(CurrentLine)352 let trimmedStr = str.Trim()353 where !string.IsNullOrEmpty(trimmedStr)354 select MakeToken(trimmedStr);355 356 tokens.AddRange(newTokens);357 tokens.Add(NewlineToken);358 515 CurrentLineNumber++; 516 if (reader.BaseStream.CanSeek) { 517 BytesRead = reader.BaseStream.Position; 518 } else { 519 BytesRead += CurrentLine.Length + 2; // guess 520 } 521 int i = 0; 522 if (!string.IsNullOrWhiteSpace(CurrentLine)) { 523 foreach (var tok in Split(CurrentLine)) { 524 TokenTypeEnum type; 525 double doubleVal; 526 DateTime dateTimeValue; 527 type = TokenTypeEnum.String; // default 528 stringVals[i] = tok.Trim(); 529 if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) { 530 type = TokenTypeEnum.Double; 531 doubleVals[i] = doubleVal; 532 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 533 type = TokenTypeEnum.DateTime; 534 dateTimeVals[i] = dateTimeValue; 535 } else if (string.IsNullOrWhiteSpace(tok)) { 536 type = TokenTypeEnum.Missing; 537 } 538 539 // couldn't parse the token as an int or float number or datetime value so return a string token 540 541 tokenTypes[i] = type; 542 i++; 543 544 if (i >= tokenTypes.Length) { 545 // increase buffer size if necessary 546 IncreaseCapacity(ref tokenTypes); 547 IncreaseCapacity(ref doubleVals); 548 IncreaseCapacity(ref stringVals); 549 IncreaseCapacity(ref dateTimeVals); 550 } 551 } 552 } 553 tokenTypes[i] = TokenTypeEnum.NewLine; 554 numTokens = i + 1; 555 tokenPos = 0; 359 556 } 360 557 } 361 558 362 559 private IEnumerable<string> Split(string line) { 363 IEnumerable<string> splitString; 364 if (separator == WHITESPACECHAR) { 365 //separate whitespaces 366 splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries); 367 } else { 368 splitString = line.Split(separator); 369 } 370 int cur = splitString.Count(); 371 foreach (var str in splitString) { 372 yield return str; 373 cur--; 374 // do not return the INTERNAL_SEPARATOR after the last string 375 if (cur != 0) { 376 yield return INTERNAL_SEPARATOR; 377 } 378 } 379 } 380 381 private Token MakeToken(string strToken) { 382 Token token = new Token(TokenTypeEnum.String, strToken); 383 if (strToken.Equals(INTERNAL_SEPARATOR)) { 384 return SeparatorToken; 385 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 386 token.type = TokenTypeEnum.Double; 387 return token; 388 } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) { 389 token.type = TokenTypeEnum.DateTime; 390 return token; 391 } 392 393 // couldn't parse the token as an int or float number or datetime value so return a string token 394 return token; 395 } 396 397 public Token Peek() { 398 return tokens[0]; 399 } 400 401 public Token Next() { 402 Token next = tokens[0]; 403 tokens.RemoveAt(0); 404 if (tokens.Count == 0) { 405 ReadNextTokens(); 406 } 407 return next; 408 } 409 410 public bool HasNext() { 411 return tokens.Count > 0 || !reader.EndOfStream; 560 return separator == WHITESPACECHAR ? 561 line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) : 562 line.Split(separators); 563 } 564 565 private static void IncreaseCapacity<T>(ref T[] arr) { 566 int n = (int)Math.Floor(arr.Length * 1.7); // guess 567 T[] arr2 = new T[n]; 568 Array.Copy(arr, arr2, arr.Length); 569 arr = arr2; 412 570 } 413 571 } … … 415 573 416 574 #region parsing 417 private void Parse(bool columnNamesInFirstLine) {418 if (columnNamesInFirstLine) {419 ParseVariableNames();420 if (!tokenizer.HasNext())421 Error(422 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",423 "", tokenizer.CurrentLineNumber);424 }425 ParseValues();426 if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);427 }428 429 private void ParseValues() {430 while (tokenizer.HasNext()) {431 if (tokenizer.Peek() == tokenizer.NewlineToken) {432 tokenizer.Next();433 } else {434 List<object> row = new List<object>();435 object value = NextValue(tokenizer);436 row.Add(value);437 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {438 Expect(tokenizer.SeparatorToken);439 row.Add(NextValue(tokenizer));440 }441 Expect(tokenizer.NewlineToken);442 // all rows have to have the same number of values443 // the first row defines how many samples are needed444 if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {445 Error("The first row of the dataset has " + rowValues[0].Count + " columns." +446 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",447 tokenizer.CurrentLineNumber);448 }449 rowValues.Add(row);450 }451 }452 }453 454 private object NextValue(Tokenizer tokenizer) {455 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;456 Token current = tokenizer.Next();457 if (current.type == TokenTypeEnum.Separator) {458 return double.NaN;459 } else if (current.type == TokenTypeEnum.String) {460 return current.stringValue;461 } else if (current.type == TokenTypeEnum.Double) {462 return current.doubleValue;463 } else if (current.type == TokenTypeEnum.DateTime) {464 return current.dateTimeValue;465 }466 // found an unexpected token => throw error467 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);468 // this line is never executed because Error() throws an exception469 throw new InvalidOperationException();470 }471 575 472 576 private void ParseVariableNames() { 473 577 // the first line must contain variable names 474 List<Token> tokens = new List<Token>(); 475 Token valueToken; 476 valueToken = tokenizer.Next(); 477 tokens.Add(valueToken); 478 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 479 Expect(tokenizer.SeparatorToken); 480 valueToken = tokenizer.Next(); 481 if (valueToken != tokenizer.NewlineToken) { 482 tokens.Add(valueToken); 483 } 484 } 485 if (valueToken != tokenizer.NewlineToken) { 486 Expect(tokenizer.NewlineToken); 487 } 488 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 489 } 490 491 private void Expect(Token expectedToken) { 492 Token actualToken = tokenizer.Next(); 493 if (actualToken != expectedToken) { 494 Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber); 495 } 578 List<string> varNames = new List<string>(); 579 580 TokenTypeEnum type; 581 string strVal; 582 double dblVal; 583 DateTime dateTimeVal; 584 585 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 586 587 // the first token must be a variable name 588 if (type != TokenTypeEnum.String) 589 throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type); 590 varNames.Add(strVal); 591 592 while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) { 593 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 594 varNames.Add(strVal); 595 } 596 ExpectType(TokenTypeEnum.NewLine); 597 598 variableNames = varNames; 599 } 600 601 private void ExpectType(TokenTypeEnum expectedToken) { 602 if (tokenizer.PeekType() != expectedToken) 603 throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType()); 604 tokenizer.Skip(); 496 605 } 497 606
Note: See TracChangeset
for help on using the changeset viewer.