- Timestamp:
- 01/31/20 12:42:47 (5 years ago)
- Location:
- branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3
- Files:
-
- 5 added
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Classification/ResourceClassificationInstanceProvider.cs
r17180 r17414 40 40 using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read)) { 41 41 var entry = instancesZipFile.GetEntry(descriptor.ResourceName); 42 NumberFormatInfo numberFormat; 43 DateTimeFormatInfo dateFormat; 44 char separator; 45 using (Stream stream = entry.Open()) { 46 TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator); 47 } 42 43 var formatOptions = GetFormatOptions(entry); 48 44 49 45 TableFileParser csvFileParser = new TableFileParser(); 50 46 using (Stream stream = entry.Open()) { 51 csvFileParser.Parse(stream, numberFormat, dateFormat, separator, true);47 csvFileParser.Parse(stream, formatOptions, true); 52 48 } 53 49 … … 65 61 .Where(x => Regex.Match(x, @".*\.Data\." + fileName).Success).SingleOrDefault(); 66 62 } 63 64 protected virtual TableFileFormatOptions GetFormatOptions(ZipArchiveEntry entry) { 65 using (Stream stream = entry.Open()) { 66 return TableFileParser.DetermineFileFormat(stream); 67 } 68 } 67 69 } 68 70 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs
r17180 r17414 45 45 OnProgressChanged(e / (double)fileSize); 46 46 }; 47 csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator, csvFormat.VariableNamesAvailable); 47 var formatOptions = new TableFileFormatOptions { 48 NumberFormat = csvFormat.NumberFormatInfo, DateTimeFormat = csvFormat.DateTimeFormatInfo, ColumnSeparator = csvFormat.Separator 49 }; 50 csvFileParser.Parse(path, formatOptions, csvFormat.VariableNamesAvailable); 48 51 return ImportData(path, type, csvFileParser); 49 52 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/HeuristicLab.Problems.Instances.DataAnalysis-3.3.csproj
r17401 r17414 178 178 <Compile Include="Regression\Physics\PhysicsInstanceProvider.cs" /> 179 179 <Compile Include="Regression\Physics\RocketFuelFlow.cs" /> 180 <Compile Include="Regression\UCITimeSeries\GasFlowModulation.cs" /> 181 <Compile Include="Regression\UCITimeSeries\UCITimeSeriesProvider.cs" /> 180 182 <Compile Include="Regression\VariableNetworks\LinearVariableNetwork.cs" /> 181 183 <Compile Include="Regression\VariableNetworks\GaussianProcessVariableNetwork.cs" /> … … 263 265 <Compile Include="Regression\Vladislavleva\UnwrappedBallFunctionFiveDimensional.cs" /> 264 266 <Compile Include="Regression\Vladislavleva\VladislavlevaInstanceProvider.cs" /> 267 <Compile Include="TableFileFormatOptions.cs" /> 265 268 <Compile Include="TableFileParser.cs" /> 266 269 <Compile Include="TimeSeries\CSV\TimeSeriesPrognosisCSVInstanceProvider.cs" /> … … 278 281 <EmbeddedResource Include="Regression\Data\MibaFriction.zip" /> 279 282 <EmbeddedResource Include="Regression\Data\PennML.zip" /> 283 <EmbeddedResource Include="Regression\Data\UCITimeSeries.zip" /> 280 284 </ItemGroup> 281 285 <ItemGroup> -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/PennML/PennMLRegressionInstanceProvider.cs
r17180 r17414 22 22 using System; 23 23 using System.Collections.Generic; 24 using System.Globalization;25 24 using System.IO; 26 25 using System.IO.Compression; … … 57 56 using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read)) { 58 57 foreach (var entry in instancesZipFile.Entries) { 59 NumberFormatInfo numberFormat; 60 DateTimeFormatInfo dateFormat; 61 char separator; 62 using (var stream = entry.Open()) { 63 // the method below disposes the stream 64 TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator); 65 } 58 var formatOptions = GetFormatOptions(entry); 66 59 67 60 using (var stream = entry.Open()) { … … 70 63 71 64 // by convention each dataset from the PennML collection reserves the last column for the target 72 var variableNames = header.Split( separator);65 var variableNames = header.Split(formatOptions.ColumnSeparator); 73 66 var allowedInputVariables = variableNames.Take(variableNames.Length - 1); 74 67 var target = variableNames.Last(); -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/ResourceRegressionInstanceProvider.cs
r17180 r17414 39 39 using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read)) { 40 40 var entry = instancesZipFile.GetEntry(descriptor.ResourceName); 41 NumberFormatInfo numberFormat; 42 DateTimeFormatInfo dateFormat; 43 char separator; 44 using (Stream stream = entry.Open()) { 45 TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator); 46 } 41 var formatOptions = GetFormatOptions(entry); 47 42 48 43 TableFileParser csvFileParser = new TableFileParser(); 49 44 using (Stream stream = entry.Open()) { 50 csvFileParser.Parse(stream, numberFormat, dateFormat, separator, true);45 csvFileParser.Parse(stream, formatOptions, true); 51 46 } 52 47 … … 64 59 .Where(x => Regex.Match(x, @".*\.Data\." + fileName).Success).SingleOrDefault(); 65 60 } 61 62 protected virtual TableFileFormatOptions GetFormatOptions(ZipArchiveEntry entry) { 63 using (Stream stream = entry.Open()) { 64 return TableFileParser.DetermineFileFormat(stream); 65 } 66 } 66 67 } 67 68 } -
branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r17180 r17414 20 20 #endregion 21 21 22 23 22 using System; 24 23 using System.Collections; … … 29 28 using System.Linq; 30 29 using System.Text; 30 using HeuristicLab.Problems.DataAnalysis; 31 31 32 32 namespace HeuristicLab.Problems.Instances.DataAnalysis { … … 89 89 90 90 public bool AreColumnNamesInFirstLine(string fileName) { 91 NumberFormatInfo numberFormat; 92 DateTimeFormatInfo dateTimeFormatInfo; 93 char separator; 94 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 91 var formatOptions = DetermineFileFormat(fileName); 95 92 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 96 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);93 return AreColumnNamesInFirstLine(stream, formatOptions); 97 94 } 98 95 } 99 96 100 97 public bool AreColumnNamesInFirstLine(Stream stream) { 101 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 102 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 103 char separator = ','; 104 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator); 105 } 106 107 public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat, 108 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 98 var formatOptions = new TableFileFormatOptions { 99 NumberFormat = NumberFormatInfo.InvariantInfo, 100 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 101 ColumnSeparator = ',' 102 }; 103 return AreColumnNamesInFirstLine(stream, formatOptions); 104 } 105 106 public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) { 109 107 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 110 return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator); 111 } 112 } 113 114 public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat, 115 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 108 return AreColumnNamesInFirstLine(stream, formatOptions); 109 } 110 } 111 112 public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) { 116 113 using (StreamReader reader = new StreamReader(stream, Encoding)) { 117 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);114 tokenizer = new Tokenizer(reader, formatOptions); 118 115 return (tokenizer.PeekType() != TokenTypeEnum.Double); 119 116 } … … 126 123 /// <param name="columnNamesInFirstLine"></param> 127 124 public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) { 128 NumberFormatInfo numberFormat; 129 DateTimeFormatInfo dateTimeFormatInfo; 130 char separator; 131 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 125 var formatOptions = DetermineFileFormat(fileName); 132 126 EstimateNumberOfLines(fileName); 133 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);127 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), formatOptions, columnNamesInFirstLine, lineLimit); 134 128 } 135 129 … … 142 136 /// <param name="separator">defines the separator</param> 143 137 /// <param name="columnNamesInFirstLine"></param> 144 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {138 public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { 145 139 EstimateNumberOfLines(fileName); 146 140 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 147 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);141 Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit); 148 142 } 149 143 } … … 182 176 /// <param name="columnNamesInFirstLine"></param> 183 177 public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) { 184 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 185 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 186 char separator = ','; 187 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 178 var formatOptions = new TableFileFormatOptions { 179 NumberFormat = NumberFormatInfo.InvariantInfo, 180 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 181 ColumnSeparator = ',' 182 }; 183 Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit); 188 184 } 189 185 … … 196 192 /// <param name="separator">defines the separator</param> 197 193 /// <param name="columnNamesInFirstLine"></param> 198 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {194 public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { 199 195 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 200 196 201 197 using (var reader = new StreamReader(stream)) { 202 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);198 tokenizer = new Tokenizer(reader, formatOptions); 203 199 var strValues = new List<List<string>>(); 204 200 values = new List<IList>(); … … 257 253 this.columns = values.Count; 258 254 255 // see if any string column can be converted to vectors 256 if (formatOptions.VectorSeparator != null) { 257 for (int i = 0; i < values.Count; i++) { 258 if (!(values[i] is List<string> stringList)) continue; 259 260 var strings = new string[stringList.Count][]; 261 var doubles = new double[strings.Length][]; 262 bool allDoubles = true; 263 for (int j = 0; j < strings.Length && allDoubles; j++) { 264 strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value); 265 doubles[j] = new double[strings[j].Length]; 266 for (int k = 0; k < doubles[j].Length && allDoubles; k++) { 267 allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]); 268 } 269 } 270 271 if (allDoubles) { 272 var vectorList = new List<DoubleVector>(stringList.Count); 273 for (int j = 0; j < doubles.Length; j++) { 274 vectorList.Add(new DoubleVector(doubles[j])); 275 } 276 277 values[i] = vectorList; 278 } 279 } 280 } 281 259 282 // replace lists with undefined type (object) with double-lists 260 283 for (int i = 0; i < values.Count; i++) { … … 271 294 var stringList = l as List<string>; 272 295 var objList = l as List<object>; 296 var vecList = l as List<DoubleVector>; 273 297 if (dblList != null) dblList.TrimExcess(); 274 298 if (byteList != null) byteList.TrimExcess(); … … 276 300 if (stringList != null) stringList.TrimExcess(); 277 301 if (objList != null) objList.TrimExcess(); 302 if (vecList != null) vecList.TrimExcess(); 278 303 } 279 304 … … 422 447 #endregion 423 448 424 public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {425 DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);426 } 427 428 public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {449 public static TableFileFormatOptions DetermineFileFormat(string path) { 450 return DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)); 451 } 452 453 public static TableFileFormatOptions DetermineFileFormat(Stream stream) { 429 454 using (StreamReader reader = new StreamReader(stream)) { 430 455 // skip first line … … 449 474 // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators) 450 475 if (OccurrencesOf(charCounts, '.') > 10) { 451 numberFormat = NumberFormatInfo.InvariantInfo; 452 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 453 separator = POSSIBLE_SEPARATORS 454 .Where(c => OccurrencesOf(charCounts, c) > 10) 455 .OrderBy(c => -OccurrencesOf(charCounts, c)) 456 .DefaultIfEmpty(' ') 457 .First(); 476 return new TableFileFormatOptions { 477 NumberFormat = NumberFormatInfo.InvariantInfo, 478 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 479 ColumnSeparator = POSSIBLE_SEPARATORS 480 .Where(c => OccurrencesOf(charCounts, c) > 10) 481 .OrderBy(c => -OccurrencesOf(charCounts, c)) 482 .DefaultIfEmpty(' ') 483 .First() 484 }; 458 485 } else if (OccurrencesOf(charCounts, ',') > 10) { 459 486 // no points and many commas … … 470 497 if (tokensWithMultipleCommas > 1) { 471 498 // English format (only integer values) with ',' as separator 472 numberFormat = NumberFormatInfo.InvariantInfo; 473 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 474 separator = ','; 499 return new TableFileFormatOptions { 500 NumberFormat = NumberFormatInfo.InvariantInfo, 501 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 502 ColumnSeparator = ',' 503 }; 475 504 } else { 476 505 char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail 477 506 // German format (real values) 478 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); 479 dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")); 480 separator = POSSIBLE_SEPARATORS 481 .Except(disallowedSeparators) 507 return new TableFileFormatOptions { 508 NumberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), 509 DateTimeFormat = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), 510 ColumnSeparator = POSSIBLE_SEPARATORS 511 .Except(disallowedSeparators) 512 .Where(c => OccurrencesOf(charCounts, c) > 10) 513 .OrderBy(c => -OccurrencesOf(charCounts, c)) 514 .DefaultIfEmpty(' ') 515 .First() 516 }; 517 } 518 } else { 519 // no points and no commas => English format 520 return new TableFileFormatOptions { 521 NumberFormat = NumberFormatInfo.InvariantInfo, 522 DateTimeFormat = DateTimeFormatInfo.InvariantInfo, 523 ColumnSeparator = POSSIBLE_SEPARATORS 482 524 .Where(c => OccurrencesOf(charCounts, c) > 10) 483 525 .OrderBy(c => -OccurrencesOf(charCounts, c)) 484 526 .DefaultIfEmpty(' ') 485 .First(); 486 } 487 } else { 488 // no points and no commas => English format 489 numberFormat = NumberFormatInfo.InvariantInfo; 490 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 491 separator = POSSIBLE_SEPARATORS 492 .Where(c => OccurrencesOf(charCounts, c) > 10) 493 .OrderBy(c => -OccurrencesOf(charCounts, c)) 494 .DefaultIfEmpty(' ') 495 .First(); 527 .First() 528 }; 496 529 } 497 530 } … … 540 573 } 541 574 542 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {575 public Tokenizer(StreamReader reader, TableFileFormatOptions formatOptions) { 543 576 this.reader = reader; 544 this.numberFormatInfo = numberFormatInfo;545 this.dateTimeFormatInfo = dateTimeFormatInfo;546 this.separator = separator;577 this.numberFormatInfo = formatOptions.NumberFormat; 578 this.dateTimeFormatInfo = formatOptions.DateTimeFormat; 579 this.separator = formatOptions.ColumnSeparator; 547 580 this.separators = new char[] { separator }; 548 581 ReadNextTokens();
Note: See TracChangeset
for help on using the changeset viewer.