Changeset 13440 for trunk/sources
- Timestamp:
- 12/07/15 17:25:31 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r13414 r13440 28 28 using System.Linq; 29 29 using System.Runtime.Serialization; 30 using System.Text; 30 31 31 32 namespace HeuristicLab.Problems.Instances.DataAnalysis { … … 36 37 private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR }; 37 38 private Tokenizer tokenizer; 38 private List<List<object>> rowValues;39 private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file 39 40 40 41 private int rows; … … 72 73 73 74 public TableFileParser() { 74 rowValues = new List<List<object>>();75 75 variableNames = new List<string>(); 76 76 } … … 104 104 using (StreamReader reader = new StreamReader(stream)) { 105 105 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 106 return tokenizer.PeekType() != TokenTypeEnum.Double;106 return (tokenizer.PeekType() != TokenTypeEnum.Double); 107 107 } 108 108 } … … 118 118 char separator; 119 119 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 120 EstimateNumberOfLines(fileName); 120 121 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 121 122 } … … 130 131 /// <param name="columnNamesInFirstLine"></param> 131 132 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 133 EstimateNumberOfLines(fileName); 132 134 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 133 135 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 136 } 137 } 138 139 // determines the number of newline characters in the first 64KB to guess the number of rows for a file 140 private void EstimateNumberOfLines(string fileName) { 141 var len = new System.IO.FileInfo(fileName).Length; 142 var buf = new char[64 * 1024]; 143 var reader = new StreamReader(File.OpenRead(fileName)); 144 reader.ReadBlock(buf, 0, buf.Length); 145 int numNewLine = 0; 146 foreach (var ch in buf) if (ch == '\n') numNewLine++; 147 if (numNewLine == 0) { 148 // fail -> keep the default setting 149 return; 150 } else { 151 double charsPerLineFactor = buf.Length / (double)numNewLine; 152 double estimatedLines = len / charsPerLineFactor; 153 estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough 134 154 } 135 155 } … … 158 178 using (StreamReader reader = new StreamReader(stream)) { 159 179 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 160 // parse the file 161 Parse(columnNamesInFirstLine, lineLimit); 162 } 163 164 // translate the list of samples into a DoubleMatrixData item 165 rows = rowValues.Count; 166 columns = rowValues[0].Count; 167 values = new List<IList>(); 168 169 //create columns 170 for (int col = 0; col < columns; col++) { 171 var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType()); 172 if (!types.Any()) { 173 values.Add(new List<string>()); 174 continue; 175 } 176 177 var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key; 178 if (columnType == typeof(double)) values.Add(new List<double>()); 179 else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>()); 180 else if (columnType == typeof(string)) values.Add(new List<string>()); 181 else throw new InvalidOperationException(); 182 } 183 184 185 186 //fill with values 187 foreach (List<object> row in rowValues) { 188 int columnIndex = 0; 189 foreach (object element in row) { 190 if (values[columnIndex] is List<double> && !(element is double)) 191 values[columnIndex].Add(double.NaN); 192 else if (values[columnIndex] is List<DateTime> && !(element is DateTime)) 193 values[columnIndex].Add(DateTime.MinValue); 194 else if (values[columnIndex] is List<string> && !(element is string)) 195 values[columnIndex].Add(element.ToString()); 196 else 197 values[columnIndex].Add(element); 198 columnIndex++; 199 } 180 // parse the file line by line 181 values = new List<IList>(); 182 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 183 foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) { 184 columns = row.Count; 185 // on the first row we create our lists for column-oriented storage 186 if (!values.Any()) { 187 foreach (var obj in row) { 188 // create a list type matching the object type and add first element 189 if (obj == null) { 190 var l = new List<object>(estimatedNumberOfLines); 191 values.Add(l); 192 l.Add(obj); 193 } else if (obj is double) { 194 var l = new List<double>(estimatedNumberOfLines); 195 values.Add(l); 196 l.Add((double)obj); 197 } else if (obj is DateTime) { 198 var l = new List<DateTime>(estimatedNumberOfLines); 199 values.Add(l); 200 l.Add((DateTime)obj); 201 } else if (obj is string) { 202 var l = new List<string>(estimatedNumberOfLines); 203 values.Add(l); 204 l.Add((string)obj); 205 } else throw new InvalidOperationException(); 206 } 207 // fill with initial value 208 } else { 209 // the columns are already there -> try to add values 210 int columnIndex = 0; 211 foreach (object element in row) { 212 if (values[columnIndex] is List<double> && !(element is double)) 213 values[columnIndex].Add(double.NaN); 214 else if (values[columnIndex] is List<DateTime> && !(element is DateTime)) 215 values[columnIndex].Add(DateTime.MinValue); 216 else if (values[columnIndex] is List<string> && !(element is string)) 217 values[columnIndex].Add(element.ToString()); 218 else 219 values[columnIndex].Add(element); 220 columnIndex++; 221 } 222 } 223 } 224 225 if (!values.Any() || values.First().Count == 0) 226 Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 227 } 228 229 // after everything has been parsed make sure the lists are as compact as possible 230 foreach (var l in values) { 231 var dblList = l as List<double>; 232 var byteList = l as List<byte>; 233 var dateList = l as List<DateTime>; 234 var stringList = l as List<string>; 235 var objList = l as List<object>; 236 if (dblList != null) dblList.TrimExcess(); 237 if (byteList != null) byteList.TrimExcess(); 238 if (dateList != null) dateList.TrimExcess(); 239 if (stringList != null) stringList.TrimExcess(); 240 if (objList != null) objList.TrimExcess(); 200 241 } 201 242 } … … 315 356 } 316 357 317 318 358 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) { 319 359 this.reader = reader; … … 329 369 try { 330 370 BytesRead = reader.BaseStream.Position; 331 } catch (IOException) { 371 } 372 catch (IOException) { 332 373 BytesRead += CurrentLine.Length + 2; // guess 333 } catch (NotSupportedException) { 374 } 375 catch (NotSupportedException) { 334 376 BytesRead += CurrentLine.Length + 2; 335 377 } … … 413 455 dblVal = doubleVals[tokenPos]; 414 456 dateTimeVal = dateTimeVals[tokenPos]; 415 416 457 Skip(); 417 458 } … … 424 465 425 466 #region parsing 426 private voidParse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit467 private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit 427 468 if (columnNamesInFirstLine) { 428 469 ParseVariableNames(); … … 432 473 "", tokenizer.CurrentLineNumber); 433 474 } 434 ParseValues(lineLimit); 435 if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 436 } 437 438 private void ParseValues(int lineLimit = -1) { 475 return ParseValues(lineLimit); 476 } 477 478 private IEnumerable<List<object>> ParseValues(int lineLimit = -1) { 439 479 int nLinesParsed = 0; 480 int numValuesInFirstRow = -1; 440 481 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 441 482 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { … … 454 495 // all rows have to have the same number of values 455 496 // the first row defines how many samples are needed 456 if (rowValues.Count > 0 && rowValues[0].Count != row.Count) { 457 Error("The first row of the dataset has " + rowValues[0].Count + " columns." + 497 if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count; 498 else if (numValuesInFirstRow != row.Count) { 499 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + 458 500 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", 459 501 tokenizer.CurrentLineNumber); 460 502 } 461 rowValues.Add(row);503 yield return row; 462 504 } 463 505
Note: See TracChangeset
for help on using the changeset viewer.