- Timestamp:
- 09/21/16 09:49:22 (8 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r14285 r14296 198 198 /// <param name="columnNamesInFirstLine"></param> 199 199 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 200 using (StreamReader reader = new StreamReader(stream, Encoding)) { 200 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 201 202 using (var reader = new StreamReader(stream)) { 201 203 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 204 var strValues = new List<List<string>>(); 202 205 values = new List<IList>(); 203 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 204 205 if (columnNamesInFirstLine) { 206 ParseVariableNames(); 207 if (!tokenizer.HasNext()) 208 Error( 209 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 210 "", tokenizer.CurrentLineNumber); 211 } 212 213 214 // read values... start in first row 206 Prepare(columnNamesInFirstLine, strValues); 207 215 208 int nLinesParsed = 0; 216 209 int colIdx = 0; 217 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)218 210 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 219 211 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { … … 221 213 222 214 // all rows have to have the same number of values 223 // the first row defines how many samples are needed224 if ( numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row225 else if (colIdx > 0 && numValuesInFirstRow != colIdx) {// read at least one value in the row (support for skipping empty lines)226 Error("The first row of the dataset has " + numValuesInFirstRow+ " columns." 
+ Environment.NewLine +215 // the first row defines how many elements are needed 216 if (colIdx > 0 && values.Count != colIdx) { 217 // read at least one value in the row (support for skipping empty lines) 218 Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + 227 219 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 228 220 tokenizer.CurrentLineNumber); 229 221 } 230 222 OnReport(tokenizer.BytesRead); … … 234 226 } else { 235 227 // read one value 236 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 228 TokenTypeEnum type; 229 string strVal; 230 double dblVal; 231 DateTime dateTimeVal; 237 232 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 238 233 239 // initialize columns on the first row (fixing data types as presented in the first row...) 240 if (nLinesParsed == 0) { 241 values.Add(CreateList(type, estimatedNumberOfLines)); 242 } else if (colIdx == values.Count) { 243 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 234 if (colIdx == values.Count) { 235 Error("The first row of the dataset has " + values.Count + " columns." 
+ Environment.NewLine + 244 236 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 245 237 tokenizer.CurrentLineNumber); 246 238 } 247 239 if (!IsColumnTypeCompatible(values[colIdx], type)) { 248 values[colIdx] = ConvertToStringColumn(values[colIdx]);240 values[colIdx] = strValues[colIdx]; 249 241 } 242 250 243 // add the value to the column 251 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 244 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 245 if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string> 246 strValues[colIdx].Add(strVal); 247 } 248 colIdx++; 252 249 } 253 250 } 254 255 if (!values.Any() || values.First().Count == 0) 256 Error("Couldn't parse data values. Probably because of incorrect number format " +257 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);258 }251 } 252 253 if (!values.Any() || values.First().Count == 0) 254 Error("Couldn't parse data values. Probably because of incorrect number format " + 255 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 259 256 260 257 this.rows = values.First().Count; … … 277 274 // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) 278 275 GC.Collect(2, GCCollectionMode.Forced); 276 } 277 278 private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) { 279 if (columnNamesInFirstLine) { 280 ParseVariableNames(); 281 if (!tokenizer.HasNext()) 282 Error( 283 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 284 "", tokenizer.CurrentLineNumber); 285 } 286 // read first line to determine types and allocate specific lists 287 // read values... 
start in first row 288 int colIdx = 0; 289 while (tokenizer.PeekType() != TokenTypeEnum.NewLine) { 290 // read one value 291 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 292 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 293 294 // initialize column 295 values.Add(CreateList(type, estimatedNumberOfLines)); 296 if (type == TokenTypeEnum.String) 297 strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string> 298 else 299 strValues.Add(new List<string>(estimatedNumberOfLines)); 300 301 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 302 if (type != TokenTypeEnum.String) 303 strValues[colIdx].Add(strVal); 304 colIdx++; 305 } 306 tokenizer.Skip(); // skip newline 279 307 } 280 308 … … 530 558 type = TokenTypeEnum.Double; 531 559 doubleVals[i] = doubleVal; 532 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 560 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue) 561 && dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this 562 ) { 533 563 type = TokenTypeEnum.DateTime; 534 564 dateTimeVals[i] = dateTimeValue;
Note: See TracChangeset for help on using the changeset viewer.