Changeset 17414 for branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
- Timestamp: 01/31/20 12:42:47 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r17180 r17414 20 20 #endregion 21 21 22 23 22 using System; 24 23 using System.Collections; … … 29 28 using System.Linq; 30 29 using System.Text; 30 using HeuristicLab.Problems.DataAnalysis; 31 31 32 32 namespace HeuristicLab.Problems.Instances.DataAnalysis { … … 89 89 90 90 public bool AreColumnNamesInFirstLine(string fileName) { 91 NumberFormatInfo numberFormat; 92 DateTimeFormatInfo dateTimeFormatInfo; 93 char separator; 94 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 91 var formatOptions = DetermineFileFormat(fileName); 95 92 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 96 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);93 return AreColumnNamesInFirstLine(stream, formatOptions); 97 94 } 98 95 } 99 96 100 97 public bool AreColumnNamesInFirstLine(Stream stream) { 101 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 102 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 103 char separator = ','; 104 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator); 105 } 106 107 public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat, 108 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 98 var formatOptions = new TableFileFormatOptions { 99 NumberFormat = NumberFormatInfo.InvariantInfo, 100 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 101 ColumnSeparator = ',' 102 }; 103 return AreColumnNamesInFirstLine(stream, formatOptions); 104 } 105 106 public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) { 109 107 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 110 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator); 111 } 112 } 113 114 public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo 
numberFormat, 115 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 108 return AreColumnNamesInFirstLine(stream, formatOptions); 109 } 110 } 111 112 public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) { 116 113 using (StreamReader reader = new StreamReader(stream, Encoding)) { 117 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);114 tokenizer = new Tokenizer(reader, formatOptions); 118 115 return (tokenizer.PeekType() != TokenTypeEnum.Double); 119 116 } … … 126 123 /// <param name="columnNamesInFirstLine"></param> 127 124 public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) { 128 NumberFormatInfo numberFormat; 129 DateTimeFormatInfo dateTimeFormatInfo; 130 char separator; 131 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 125 var formatOptions = DetermineFileFormat(fileName); 132 126 EstimateNumberOfLines(fileName); 133 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);127 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), formatOptions, columnNamesInFirstLine, lineLimit); 134 128 } 135 129 … … 142 136 /// <param name="separator">defines the separator</param> 143 137 /// <param name="columnNamesInFirstLine"></param> 144 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {138 public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { 145 139 EstimateNumberOfLines(fileName); 146 140 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 147 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);141 Parse(stream, 
formatOptions, columnNamesInFirstLine, lineLimit); 148 142 } 149 143 } … … 182 176 /// <param name="columnNamesInFirstLine"></param> 183 177 public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) { 184 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 185 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 186 char separator = ','; 187 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 178 var formatOptions = new TableFileFormatOptions { 179 NumberFormat = NumberFormatInfo.InvariantInfo, 180 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 181 ColumnSeparator = ',' 182 }; 183 Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit); 188 184 } 189 185 … … 196 192 /// <param name="separator">defines the separator</param> 197 193 /// <param name="columnNamesInFirstLine"></param> 198 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {194 public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { 199 195 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 200 196 201 197 using (var reader = new StreamReader(stream)) { 202 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);198 tokenizer = new Tokenizer(reader, formatOptions); 203 199 var strValues = new List<List<string>>(); 204 200 values = new List<IList>(); … … 257 253 this.columns = values.Count; 258 254 255 // see if any string column can be converted to vectors 256 if (formatOptions.VectorSeparator != null) { 257 for (int i = 0; i < values.Count; i++) { 258 if (!(values[i] is List<string> stringList)) continue; 259 260 var strings = new string[stringList.Count][]; 261 var doubles = new double[strings.Length][]; 262 bool allDoubles = true; 263 for (int j = 0; j < strings.Length && allDoubles; 
j++) { 264 strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value); 265 doubles[j] = new double[strings[j].Length]; 266 for (int k = 0; k < doubles[j].Length && allDoubles; k++) { 267 allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]); 268 } 269 } 270 271 if (allDoubles) { 272 var vectorList = new List<DoubleVector>(stringList.Count); 273 for (int j = 0; j < doubles.Length; j++) { 274 vectorList.Add(new DoubleVector(doubles[j])); 275 } 276 277 values[i] = vectorList; 278 } 279 } 280 } 281 259 282 // replace lists with undefined type (object) with double-lists 260 283 for (int i = 0; i < values.Count; i++) { … … 271 294 var stringList = l as List<string>; 272 295 var objList = l as List<object>; 296 var vecList = l as List<DoubleVector>; 273 297 if (dblList != null) dblList.TrimExcess(); 274 298 if (byteList != null) byteList.TrimExcess(); … … 276 300 if (stringList != null) stringList.TrimExcess(); 277 301 if (objList != null) objList.TrimExcess(); 302 if (vecList != null) vecList.TrimExcess(); 278 303 } 279 304 … … 422 447 #endregion 423 448 424 public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {425 DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);426 } 427 428 public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {449 public static TableFileFormatOptions DetermineFileFormat(string path) { 450 return DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)); 451 } 452 453 public static TableFileFormatOptions DetermineFileFormat(Stream stream) { 429 454 using (StreamReader reader = new StreamReader(stream)) { 430 455 // skip first line … … 449 474 // in all 
cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators) 450 475 if (OccurrencesOf(charCounts, '.') > 10) { 451 numberFormat = NumberFormatInfo.InvariantInfo; 452 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 453 separator = POSSIBLE_SEPARATORS 454 .Where(c => OccurrencesOf(charCounts, c) > 10) 455 .OrderBy(c => -OccurrencesOf(charCounts, c)) 456 .DefaultIfEmpty(' ') 457 .First(); 476 return new TableFileFormatOptions { 477 NumberFormat = NumberFormatInfo.InvariantInfo, 478 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 479 ColumnSeparator = POSSIBLE_SEPARATORS 480 .Where(c => OccurrencesOf(charCounts, c) > 10) 481 .OrderBy(c => -OccurrencesOf(charCounts, c)) 482 .DefaultIfEmpty(' ') 483 .First() 484 }; 458 485 } else if (OccurrencesOf(charCounts, ',') > 10) { 459 486 // no points and many commas … … 470 497 if (tokensWithMultipleCommas > 1) { 471 498 // English format (only integer values) with ',' as separator 472 numberFormat = NumberFormatInfo.InvariantInfo; 473 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 474 separator = ','; 499 return new TableFileFormatOptions { 500 NumberFormat = NumberFormatInfo.InvariantInfo, 501 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 502 ColumnSeparator = ',' 503 }; 475 504 } else { 476 505 char[] disallowedSeparators = new char[] { ',' }; // n. def. 
contains a space so ' ' should be disallowed to, however existing unit tests would fail 477 506 // German format (real values) 478 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); 479 dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")); 480 separator = POSSIBLE_SEPARATORS 481 .Except(disallowedSeparators) 507 return new TableFileFormatOptions { 508 NumberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), 509 DateTimeFormat = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), 510 ColumnSeparator = POSSIBLE_SEPARATORS 511 .Except(disallowedSeparators) 512 .Where(c => OccurrencesOf(charCounts, c) > 10) 513 .OrderBy(c => -OccurrencesOf(charCounts, c)) 514 .DefaultIfEmpty(' ') 515 .First() 516 }; 517 } 518 } else { 519 // no points and no commas => English format 520 return new TableFileFormatOptions { 521 NumberFormat = NumberFormatInfo.InvariantInfo, 522 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 523 ColumnSeparator = POSSIBLE_SEPARATORS 482 524 .Where(c => OccurrencesOf(charCounts, c) > 10) 483 525 .OrderBy(c => -OccurrencesOf(charCounts, c)) 484 526 .DefaultIfEmpty(' ') 485 .First(); 486 } 487 } else { 488 // no points and no commas => English format 489 numberFormat = NumberFormatInfo.InvariantInfo; 490 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 491 separator = POSSIBLE_SEPARATORS 492 .Where(c => OccurrencesOf(charCounts, c) > 10) 493 .OrderBy(c => -OccurrencesOf(charCounts, c)) 494 .DefaultIfEmpty(' ') 495 .First(); 527 .First() 528 }; 496 529 } 497 530 } … … 540 573 } 541 574 542 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {575 public Tokenizer(StreamReader reader, TableFileFormatOptions formatOptions) { 543 576 this.reader = reader; 544 this.numberFormatInfo = numberFormatInfo;545 this.dateTimeFormatInfo = dateTimeFormatInfo;546 this.separator = separator;577 this.numberFormatInfo = 
formatOptions.NumberFormat; 578 this.dateTimeFormatInfo = formatOptions.DateTimeFormat; 579 this.separator = formatOptions.ColumnSeparator; 547 580 this.separators = new char[] { separator }; 548 581 ReadNextTokens();
Note: See TracChangeset for help on using the changeset viewer.