Changeset 15170 for stable/HeuristicLab.Problems.Instances.DataAnalysis
- Timestamp: 07/07/17 16:44:47 (7 years ago)
- Location: stable
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
-
stable
- Property svn:mergeinfo changed
/trunk/sources merged: 14284-14286,14288,14296,14298,14408
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis merged: 14285,14296,14408
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r14186 r15170 28 28 using System.IO; 29 29 using System.Linq; 30 using System.Runtime.Serialization;31 30 using System.Text; 32 31 … … 198 197 /// <param name="columnNamesInFirstLine"></param> 199 198 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 200 using (StreamReader reader = new StreamReader(stream, Encoding)) { 199 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 200 201 using (var reader = new StreamReader(stream)) { 201 202 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 203 var strValues = new List<List<string>>(); 202 204 values = new List<IList>(); 203 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 204 205 if (columnNamesInFirstLine) { 206 ParseVariableNames(); 207 if (!tokenizer.HasNext()) 208 Error( 209 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 210 "", tokenizer.CurrentLineNumber); 211 } 212 213 214 // read values... start in first row 205 Prepare(columnNamesInFirstLine, strValues); 206 215 207 int nLinesParsed = 0; 216 208 int colIdx = 0; 217 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)218 209 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 219 210 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { … … 221 212 222 213 // all rows have to have the same number of values 223 // the first row defines how many samples are needed224 if ( numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row225 else if (colIdx > 0 && numValuesInFirstRow != colIdx) {// read at least one value in the row (support for skipping empty lines)226 Error("The first row of the dataset has " + numValuesInFirstRow+ " columns." 
+ Environment.NewLine +214 // the first row defines how many elements are needed 215 if (colIdx > 0 && values.Count != colIdx) { 216 // read at least one value in the row (support for skipping empty lines) 217 Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + 227 218 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 228 219 tokenizer.CurrentLineNumber); 229 220 } 230 221 OnReport(tokenizer.BytesRead); … … 234 225 } else { 235 226 // read one value 236 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 227 TokenTypeEnum type; 228 string strVal; 229 double dblVal; 230 DateTime dateTimeVal; 237 231 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 238 232 239 // initialize columns on the first row (fixing data types as presented in the first row...) 240 if (nLinesParsed == 0) { 241 values.Add(CreateList(type, estimatedNumberOfLines)); 242 } else if (colIdx == values.Count) { 243 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 233 if (colIdx == values.Count) { 234 Error("The first row of the dataset has " + values.Count + " columns." 
+ Environment.NewLine + 244 235 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 245 236 tokenizer.CurrentLineNumber); 246 237 } 247 238 if (!IsColumnTypeCompatible(values[colIdx], type)) { 248 values[colIdx] = ConvertToStringColumn(values[colIdx]);239 values[colIdx] = strValues[colIdx]; 249 240 } 241 250 242 // add the value to the column 251 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 243 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 244 if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string> 245 strValues[colIdx].Add(strVal); 246 } 247 colIdx++; 252 248 } 253 249 } 254 255 if (!values.Any() || values.First().Count == 0) 256 Error("Couldn't parse data values. Probably because of incorrect number format " +257 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);258 }250 } 251 252 if (!values.Any() || values.First().Count == 0) 253 Error("Couldn't parse data values. Probably because of incorrect number format " + 254 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 259 255 260 256 this.rows = values.First().Count; … … 277 273 // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) 278 274 GC.Collect(2, GCCollectionMode.Forced); 275 } 276 277 private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) { 278 if (columnNamesInFirstLine) { 279 ParseVariableNames(); 280 if (!tokenizer.HasNext()) 281 Error( 282 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 283 "", tokenizer.CurrentLineNumber); 284 } 285 // read first line to determine types and allocate specific lists 286 // read values... 
start in first row 287 int colIdx = 0; 288 while (tokenizer.PeekType() != TokenTypeEnum.NewLine) { 289 // read one value 290 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 291 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 292 293 // initialize column 294 values.Add(CreateList(type, estimatedNumberOfLines)); 295 if (type == TokenTypeEnum.String) 296 strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string> 297 else 298 strValues.Add(new List<string>(estimatedNumberOfLines)); 299 300 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 301 if (type != TokenTypeEnum.String) 302 strValues[colIdx].Add(strVal); 303 colIdx++; 304 } 305 tokenizer.Skip(); // skip newline 279 306 } 280 307 … … 530 557 type = TokenTypeEnum.Double; 531 558 doubleVals[i] = doubleVal; 532 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 559 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue) 560 && (dateTimeValue.Year > 1 || dateTimeValue.Month > 1 || dateTimeValue.Day > 1)// if no date is given it is returned as 1.1.0001 -> don't allow this 561 ) { 533 562 type = TokenTypeEnum.DateTime; 534 563 dateTimeVals[i] = dateTimeValue; … … 606 635 607 636 private void Error(string message, string token, int lineNumber) { 608 throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);637 throw new IOException(string.Format("Error while parsing. 
{0} (token: {1} lineNumber: {2}).", message, token, lineNumber)); 609 638 } 610 639 #endregion 611 612 [Serializable]613 public class DataFormatException : Exception {614 private int line;615 public int Line {616 get { return line; }617 }618 private string token;619 public string Token {620 get { return token; }621 }622 public DataFormatException(string message, string token, int line)623 : base(message + "\nToken: " + token + " (line: " + line + ")") {624 this.token = token;625 this.line = line;626 }627 628 public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }629 }630 640 } 631 641 }
Note: See TracChangeset for help on using the changeset viewer.