- Timestamp:
- 11/28/15 17:02:19 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r12012 r13411 28 28 using System.Linq; 29 29 using System.Runtime.Serialization; 30 using System.Security.Policy; 30 31 31 32 namespace HeuristicLab.Problems.Instances.DataAnalysis { … … 104 105 using (StreamReader reader = new StreamReader(stream)) { 105 106 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 106 return tokenizer.Peek ().type!= TokenTypeEnum.Double;107 return tokenizer.PeekType() != TokenTypeEnum.Double; 107 108 } 108 109 } … … 286 287 } 287 288 288 internal class Token { 289 public TokenTypeEnum type; 290 public string stringValue; 291 public double doubleValue; 292 public DateTime dateTimeValue; 293 294 public Token(TokenTypeEnum type, string value) { 295 this.type = type; 296 stringValue = value; 297 dateTimeValue = DateTime.MinValue; 298 doubleValue = 0.0; 299 } 300 301 public override string ToString() { 302 return stringValue; 303 } 304 } 289 290 //internal class Token { 291 // public TokenTypeEnum type; 292 // public string stringValue; 293 // public double doubleValue; 294 // public DateTime dateTimeValue; 295 // 296 // public Token(TokenTypeEnum type, string value) { 297 // this.type = type; 298 // stringValue = value; 299 // dateTimeValue = DateTime.MinValue; 300 // doubleValue = 0.0; 301 // } 302 // 303 // public bool Equals(Token other) { 304 // throw new NotImplementedException(); 305 // } 306 // 307 // public override string ToString() { 308 // return stringValue; 309 // } 310 // 311 // public override bool Equals(object obj) { 312 // return Equals(obj as Token); 313 // } 314 // 315 // public override int GetHashCode() { 316 // throw new NotSupportedException(); 317 // } 318 //} 305 319 306 320 307 321 internal class Tokenizer { 308 322 private StreamReader reader; 309 private List<Token> tokens; 323 // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary) 324 private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024]; 325 private string[] stringVals = new string[1024]; 326 private double[] doubleVals = new double[1024]; 327 private DateTime[] dateTimeVals = new DateTime[1024]; 328 private int tokenPos; 329 private int numTokens; 310 330 private NumberFormatInfo numberFormatInfo; 311 331 private DateTimeFormatInfo dateTimeFormatInfo; … … 324 344 } 325 345 326 private Token newlineToken;327 public Token NewlineToken {328 get { return newlineToken; }329 private set { newlineToken = value; }330 }331 private Token separatorToken;332 public Token SeparatorToken {333 get { return separatorToken; }334 private set { separatorToken = value; }335 }346 // private Token newlineToken; 347 // public Token NewlineToken { 348 // get { return newlineToken; } 349 // private set { newlineToken = value; } 350 // } 351 // private Token separatorToken; 352 // public Token SeparatorToken { 353 // get { return separatorToken; } 354 // private set { separatorToken = value; } 355 // } 336 356 337 357 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) { … … 340 360 this.dateTimeFormatInfo = dateTimeFormatInfo; 341 361 this.separator = separator; 342 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); 343 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 344 tokens = new List<Token>(); 362 //separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); 363 //newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 345 364 ReadNextTokens(); 346 365 } … … 349 368 if (!reader.EndOfStream) { 350 369 CurrentLine = reader.ReadLine(); 351 var newTokens = from str in Split(CurrentLine) 352 let trimmedStr = str.Trim() 353 where !string.IsNullOrEmpty(trimmedStr) 354 select MakeToken(trimmedStr); 355 356 tokens.AddRange(newTokens); 357 tokens.Add(NewlineToken); 358 CurrentLineNumber++; 359 } 370 int i = 0; 371 foreach (var tok in Split(CurrentLine)) { 372 var trimmedStr = tok.Trim(); 373 if (!string.IsNullOrEmpty(trimmedStr)) { 374 TokenTypeEnum type = TokenTypeEnum.String; // default 375 stringVals[i] = trimmedStr; 376 double doubleVal; 377 DateTime dateTimeValue; 378 if (trimmedStr.Equals(INTERNAL_SEPARATOR)) { 379 type = TokenTypeEnum.Separator; 380 } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) { 381 type = TokenTypeEnum.Double; 382 doubleVals[i] = doubleVal; 383 } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 384 type = TokenTypeEnum.DateTime; 385 dateTimeVals[i] = dateTimeValue; 386 } 387 388 // couldn't parse the token as an int or float number or datetime value so return a string token 389 390 tokenTypes[i] = type; 391 i++; 392 393 if (i >= tokenTypes.Length) { 394 // increase buffer size if necessary 395 IncreaseCapacity(ref tokenTypes); 396 IncreaseCapacity(ref doubleVals); 397 IncreaseCapacity(ref stringVals); 398 IncreaseCapacity(ref dateTimeVals); 399 } 400 } 401 } 402 tokenTypes[i] = TokenTypeEnum.NewLine; 403 numTokens = i + 1; 404 tokenPos = 0; 405 } 406 } 407 408 private static void IncreaseCapacity<T>(ref T[] arr) { 409 int n = (int)Math.Floor(arr.Length * 1.7); // guess 410 T[] arr2 = new T[n]; 411 Array.Copy(arr, arr2, arr.Length); 412 arr = arr2; 360 413 } 361 414 362 415 private IEnumerable<string> Split(string line) { 363 IEnumerable<string>splitString;416 string[] splitString; 364 417 if (separator == WHITESPACECHAR) { 365 418 //separate whitespaces … … 368 421 splitString = line.Split(separator); 369 422 } 370 int cur = splitString.Count(); 371 foreach (var str in splitString) { 372 yield return str; 373 cur--; 374 // do not return the INTERNAL_SEPARATOR after the last string 375 if (cur != 0) { 376 yield return INTERNAL_SEPARATOR; 377 } 378 } 379 } 380 381 private Token MakeToken(string strToken) { 382 Token token = new Token(TokenTypeEnum.String, strToken); 383 if (strToken.Equals(INTERNAL_SEPARATOR)) { 384 return SeparatorToken; 385 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 386 token.type = TokenTypeEnum.Double; 387 return token; 388 } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) { 389 token.type = TokenTypeEnum.DateTime; 390 return token; 391 } 392 393 // couldn't parse the token as an int or float number or datetime value so return a string token 394 return token; 395 } 396 397 public Token Peek() { 398 return tokens[0]; 399 } 400 401 public Token Next() { 402 Token next = tokens[0]; 403 tokens.RemoveAt(0); 404 if (tokens.Count == 0) { 423 424 for (int i = 0; i < splitString.Length - 1; i++) { 425 yield return splitString[i]; 426 yield return INTERNAL_SEPARATOR; 427 } 428 // do not return the INTERNAL_SEPARATOR after the last string 429 yield return splitString[splitString.Length - 1]; 430 } 431 432 public TokenTypeEnum PeekType() { 433 return tokenTypes[tokenPos]; 434 } 435 436 public void Skip() { 437 // simply skips one token without returning the result values 438 tokenPos++; 439 if (numTokens == tokenPos) { 405 440 ReadNextTokens(); 406 441 } 407 return next; 442 } 443 444 public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) { 445 type = tokenTypes[tokenPos]; 446 strVal = stringVals[tokenPos]; 447 dblVal = doubleVals[tokenPos]; 448 dateTimeVal = dateTimeVals[tokenPos]; 449 450 Skip(); 408 451 } 409 452 410 453 public bool HasNext() { 411 return tokens.Count > 0|| !reader.EndOfStream;454 return numTokens > tokenPos || !reader.EndOfStream; 412 455 } 413 456 } … … 429 472 private void ParseValues() { 430 473 while (tokenizer.HasNext()) { 431 if (tokenizer.Peek () == tokenizer.NewlineToken) {432 tokenizer. Next();474 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { 475 tokenizer.Skip(); 433 476 } else { 434 477 List<object> row = new List<object>(); 435 478 object value = NextValue(tokenizer); 436 479 row.Add(value); 437 while (tokenizer.HasNext() && tokenizer.Peek () == tokenizer.SeparatorToken) {438 Expect (tokenizer.SeparatorToken);480 while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) { 481 ExpectType(TokenTypeEnum.Separator); 439 482 row.Add(NextValue(tokenizer)); 440 483 } 441 Expect (tokenizer.NewlineToken);484 ExpectType(TokenTypeEnum.NewLine); 442 485 // all rows have to have the same number of values 443 486 // the first row defines how many samples are needed … … 453 496 454 497 private object NextValue(Tokenizer tokenizer) { 455 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty; 456 Token current = tokenizer.Next(); 457 if (current.type == TokenTypeEnum.Separator) { 458 return double.NaN; 459 } else if (current.type == TokenTypeEnum.String) { 460 return current.stringValue; 461 } else if (current.type == TokenTypeEnum.Double) { 462 return current.doubleValue; 463 } else if (current.type == TokenTypeEnum.DateTime) { 464 return current.dateTimeValue; 498 if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty; 499 TokenTypeEnum type; 500 string strVal; 501 double dblVal; 502 DateTime dateTimeVal; 503 504 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 505 switch (type) { 506 case TokenTypeEnum.Separator: return double.NaN; 507 case TokenTypeEnum.String: return strVal; 508 case TokenTypeEnum.Double: return dblVal; 509 case TokenTypeEnum.DateTime: return dateTimeVal; 465 510 } 466 511 // found an unexpected token => throw error 467 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);512 Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber); 468 513 // this line is never executed because Error() throws an exception 469 514 throw new InvalidOperationException(); … … 472 517 private void ParseVariableNames() { 473 518 // the first line must contain variable names 474 List<Token> tokens = new List<Token>(); 475 Token valueToken; 476 valueToken = tokenizer.Next(); 477 tokens.Add(valueToken); 478 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 479 Expect(tokenizer.SeparatorToken); 480 valueToken = tokenizer.Next(); 481 if (valueToken != tokenizer.NewlineToken) { 482 tokens.Add(valueToken); 483 } 484 } 485 if (valueToken != tokenizer.NewlineToken) { 486 Expect(tokenizer.NewlineToken); 487 } 488 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 489 } 490 491 private void Expect(Token expectedToken) { 492 Token actualToken = tokenizer.Next(); 493 if (actualToken != expectedToken) { 494 Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber); 495 } 519 List<string> varNames = new List<string>(); 520 521 TokenTypeEnum type; 522 string strVal; 523 double dblVal; 524 DateTime dateTimeVal; 525 526 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 527 528 // the first token must be a variable name 529 if (type != TokenTypeEnum.String) 530 throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type); 531 varNames.Add(strVal); 532 533 while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) { 534 ExpectType(TokenTypeEnum.Separator); 535 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 536 varNames.Add(strVal); 537 } 538 ExpectType(TokenTypeEnum.NewLine); 539 540 variableNames = varNames; 541 } 542 543 private void ExpectType(TokenTypeEnum expectedToken) { 544 if (tokenizer.PeekType() != expectedToken) 545 throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType()); 546 tokenizer.Skip(); 496 547 } 497 548
Note: See TracChangeset
for help on using the changeset viewer.