- Timestamp:
- 12/10/15 17:12:45 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r13445 r13447 142 142 var len = new System.IO.FileInfo(fileName).Length; 143 143 var buf = new char[64 * 1024]; 144 using (var reader = new StreamReader(fileName)) {144 using (var reader = new StreamReader(fileName)) { 145 145 reader.ReadBlock(buf, 0, buf.Length); 146 146 } … … 188 188 using (StreamReader reader = new StreamReader(stream)) { 189 189 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 190 // parse the file line by line191 190 values = new List<IList>(); 192 191 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 193 foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) { 194 columns = row.Count; 195 // on the first row we create our lists for column-oriented storage 196 if (!values.Any()) { 197 foreach (var obj in row) { 198 // create a list type matching the object type and add first element 199 if (obj == null) { 200 var l = new List<object>(estimatedNumberOfLines); 201 values.Add(l); 202 l.Add(obj); 203 } else if (obj is double) { 204 var l = new List<double>(estimatedNumberOfLines); 205 values.Add(l); 206 l.Add((double)obj); 207 } else if (obj is DateTime) { 208 var l = new List<DateTime>(estimatedNumberOfLines); 209 values.Add(l); 210 l.Add((DateTime)obj); 211 } else if (obj is string) { 212 var l = new List<string>(estimatedNumberOfLines); 213 values.Add(l); 214 l.Add((string)obj); 215 } else throw new InvalidOperationException(); 192 193 if (columnNamesInFirstLine) { 194 ParseVariableNames(); 195 if (!tokenizer.HasNext()) 196 Error( 197 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 198 "", tokenizer.CurrentLineNumber); 199 } 200 201 202 // read values... start in first row 203 int nLinesParsed = 0; 204 int colIdx = 0; 205 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1) 206 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 207 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { 208 tokenizer.Skip(); 209 210 // all rows have to have the same number of values 211 // the first row defines how many samples are needed 212 if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row 213 else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines) 214 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 215 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 216 tokenizer.CurrentLineNumber); 216 217 } 217 // fill with initial value 218 OnReport(tokenizer.BytesRead); 219 220 nLinesParsed++; 221 colIdx = 0; 218 222 } else { 219 // the columns are already there -> try to add values 220 int columnIndex = 0; 221 foreach (object element in row) { 222 if (values[columnIndex] is List<double> && !(element is double)) 223 values[columnIndex].Add(double.NaN); 224 else if (values[columnIndex] is List<DateTime> && !(element is DateTime)) 225 values[columnIndex].Add(DateTime.MinValue); 226 else if (values[columnIndex] is List<string> && !(element is string)) 227 values[columnIndex].Add(element.ToString()); 228 else 229 values[columnIndex].Add(element); 230 columnIndex++; 223 // read one value 224 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 225 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 226 227 // initialize columns on the first row (fixing data types as presented in the first row...) 228 if (nLinesParsed == 0) { 229 values.Add(CreateList(type, estimatedNumberOfLines)); 230 } else if (colIdx == values.Count) { 231 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 232 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 233 tokenizer.CurrentLineNumber); 231 234 } 235 // add the value to the column 236 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 232 237 } 233 238 } 234 239 235 240 if (!values.Any() || values.First().Count == 0) 236 Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 241 Error("Couldn't parse data values. Probably because of incorrect number format " + 242 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 237 243 } 238 244 239 245 this.rows = values.First().Count; 246 this.columns = values.Count; 240 247 241 248 // after everything has been parsed make sure the lists are as compact as possible … … 256 263 GC.Collect(2, GCCollectionMode.Forced); 257 264 } 265 266 #region type-dependent dispatch 267 private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) { 268 switch (type) { 269 case TokenTypeEnum.Double: 270 AddDoubleToList(list, dblVal); 271 break; 272 case TokenTypeEnum.String: 273 AddStringToList(list, strVal); 274 break; 275 case TokenTypeEnum.DateTime: 276 AddDateTimeToList(list, dateTimeVal); 277 break; 278 default: 279 throw new InvalidOperationException(); 280 } 281 } 282 283 private void AddDoubleToList(IList list, double dblVal) { 284 var dblList = list as List<double>; 285 if (dblList != null) dblList.Add(dblVal); 286 else { 287 var strList = list as List<string>; 288 if (strList != null) strList.Add(dblVal.ToString()); 289 else list.Add(null); 290 } 291 } 292 293 private void AddStringToList(IList list, string strVal) { 294 var strList = list as List<string>; 295 if (strList != null) strList.Add(strVal); 296 else { 297 var dblList = list as List<double>; 298 if (dblList != null) dblList.Add(double.NaN); 299 else list.Add(null); 300 } 301 } 302 303 private void AddDateTimeToList(IList list, DateTime dateTimeVal) { 304 var dateTimeList = list as List<DateTime>; 305 if (dateTimeList != null) dateTimeList.Add(dateTimeVal); 306 else { 307 var dblList = list as List<double>; 308 if (dblList != null) dblList.Add(double.NaN); 309 else { 310 var strList = list as List<string>; 311 if (strList != null) strList.Add(dateTimeVal.ToString()); 312 else list.Add(null); 313 } 314 } 315 } 316 317 private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) { 318 switch (type) { 319 case TokenTypeEnum.String: 320 return new List<string>(estimatedNumberOfLines); 321 case TokenTypeEnum.Double: 322 return new List<double>(estimatedNumberOfLines); 323 case TokenTypeEnum.DateTime: 324 return new List<DateTime>(estimatedNumberOfLines); 325 default: 326 throw new InvalidOperationException(); 327 } 328 } 329 #endregion 258 330 259 331 public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) { … … 338 410 339 411 #region tokenizer 412 // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character 340 413 internal enum TokenTypeEnum { 341 NewLine, S eparator, String, Double, DateTime414 NewLine, String, Double, DateTime 342 415 } 343 416 … … 354 427 private DateTimeFormatInfo dateTimeFormatInfo; 355 428 private char separator; 356 private const string INTERNAL_SEPARATOR = "#"; 429 430 // arrays for string.Split() 431 private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default 432 private readonly char[] separators; 357 433 358 434 private int currentLineNumber = 0; … … 376 452 this.dateTimeFormatInfo = dateTimeFormatInfo; 377 453 this.separator = separator; 454 this.separators = new char[] { separator }; 378 455 ReadNextTokens(); 456 } 457 458 public bool HasNext() { 459 return numTokens > tokenPos || !reader.EndOfStream; 460 } 461 462 public TokenTypeEnum PeekType() { 463 return tokenTypes[tokenPos]; 464 } 465 466 public void Skip() { 467 // simply skips one token without returning the result values 468 tokenPos++; 469 if (numTokens == tokenPos) { 470 ReadNextTokens(); 471 } 472 } 473 474 public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) { 475 type = tokenTypes[tokenPos]; 476 strVal = stringVals[tokenPos]; 477 dblVal = doubleVals[tokenPos]; 478 dateTimeVal = dateTimeVals[tokenPos]; 479 Skip(); 379 480 } 380 481 … … 382 483 if (!reader.EndOfStream) { 383 484 CurrentLine = reader.ReadLine(); 485 CurrentLineNumber++; 384 486 try { 385 487 BytesRead = reader.BaseStream.Position; … … 390 492 } 391 493 int i = 0; 392 foreach (var tok in Split(CurrentLine)) { 393 var trimmedStr = tok.Trim(); 394 if (!string.IsNullOrEmpty(trimmedStr)) { 395 TokenTypeEnum type = TokenTypeEnum.String; // default 396 stringVals[i] = trimmedStr; 494 if (!string.IsNullOrWhiteSpace(CurrentLine)) { 495 foreach (var tok in Split(CurrentLine)) { 496 TokenTypeEnum type; 397 497 double doubleVal; 398 498 DateTime dateTimeValue; 399 if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {400 type = TokenTypeEnum.Separator;401 } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {499 type = TokenTypeEnum.String; // default 500 stringVals[i] = tok.Trim(); 501 if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) { 402 502 type = TokenTypeEnum.Double; 403 503 doubleVals[i] = doubleVal; 404 } else if (DateTime.TryParse(t rimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {504 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 405 505 type = TokenTypeEnum.DateTime; 406 506 dateTimeVals[i] = dateTimeValue; 407 507 } 408 508 409 // couldn't parse the token as an int or float number 509 // couldn't parse the token as an int or float number or datetime value so return a string token 410 510 411 511 tokenTypes[i] = type; … … 427 527 } 428 528 529 private IEnumerable<string> Split(string line) { 530 return separator == WHITESPACECHAR ? 531 line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) : 532 line.Split(separators); 533 } 534 429 535 private static void IncreaseCapacity<T>(ref T[] arr) { 430 536 int n = (int)Math.Floor(arr.Length * 1.7); // guess … … 433 539 arr = arr2; 434 540 } 435 436 private IEnumerable<string> Split(string line) {437 string[] splitString;438 if (separator == WHITESPACECHAR) {439 //separate whitespaces440 splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);441 } else {442 splitString = line.Split(separator);443 }444 445 for (int i = 0; i < splitString.Length - 1; i++) {446 yield return splitString[i];447 yield return INTERNAL_SEPARATOR;448 }449 // do not return the INTERNAL_SEPARATOR after the last string450 yield return splitString[splitString.Length - 1];451 }452 453 public TokenTypeEnum PeekType() {454 return tokenTypes[tokenPos];455 }456 457 public void Skip() {458 // simply skips one token without returning the result values459 tokenPos++;460 if (numTokens == tokenPos) {461 ReadNextTokens();462 }463 }464 465 public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {466 type = tokenTypes[tokenPos];467 strVal = stringVals[tokenPos];468 dblVal = doubleVals[tokenPos];469 dateTimeVal = dateTimeVals[tokenPos];470 Skip();471 }472 473 public bool HasNext() {474 return numTokens > tokenPos || !reader.EndOfStream;475 }476 541 } 477 542 #endregion 478 543 479 544 #region parsing 480 private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit 481 if (columnNamesInFirstLine) { 482 ParseVariableNames(); 483 if (!tokenizer.HasNext()) 484 Error( 485 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 486 "", tokenizer.CurrentLineNumber); 487 } 488 return ParseValues(lineLimit); 489 } 490 491 private IEnumerable<List<object>> ParseValues(int lineLimit = -1) { 492 int nLinesParsed = 0; 493 int numValuesInFirstRow = -1; 494 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 495 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { 496 tokenizer.Skip(); 497 nLinesParsed++; 498 } else { 499 List<object> row = new List<object>(); 500 object value = NextValue(tokenizer); 501 row.Add(value); 502 while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) { 503 ExpectType(TokenTypeEnum.Separator); 504 row.Add(NextValue(tokenizer)); 505 } 506 ExpectType(TokenTypeEnum.NewLine); 507 nLinesParsed++; 508 // all rows have to have the same number of values 509 // the first row defines how many samples are needed 510 if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count; 511 else if (numValuesInFirstRow != row.Count) { 512 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + 513 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", 514 tokenizer.CurrentLineNumber); 515 } 516 yield return row; 517 } 518 519 OnReport(tokenizer.BytesRead); 520 } 521 } 522 523 private object NextValue(Tokenizer tokenizer) { 524 if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty; 545 546 private void ParseVariableNames() { 547 // the first line must contain variable names 548 List<string> varNames = new List<string>(); 549 525 550 TokenTypeEnum type; 526 551 string strVal; … … 529 554 530 555 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 531 switch (type) {532 case TokenTypeEnum.Separator: return double.NaN;533 case TokenTypeEnum.String: return strVal;534 case TokenTypeEnum.Double: return dblVal;535 case TokenTypeEnum.DateTime: return dateTimeVal;536 }537 // found an unexpected token => throw error538 Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);539 // this line is never executed because Error() throws an exception540 throw new InvalidOperationException();541 }542 543 private void ParseVariableNames() {544 // the first line must contain variable names545 List<string> varNames = new List<string>();546 547 TokenTypeEnum type;548 string strVal;549 double dblVal;550 DateTime dateTimeVal;551 552 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);553 556 554 557 // the first token must be a variable name … … 557 560 varNames.Add(strVal); 558 561 559 while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) { 560 ExpectType(TokenTypeEnum.Separator); 562 while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) { 561 563 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 562 564 varNames.Add(strVal);
Note: See TracChangeset
for help on using the changeset viewer.