- Timestamp: 01/16/16 15:22:52 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r13447 r13526 24 24 using System.Collections; 25 25 using System.Collections.Generic; 26 using System.Diagnostics.Contracts; 26 27 using System.Globalization; 27 28 using System.IO; … … 141 142 private void EstimateNumberOfLines(string fileName) { 142 143 var len = new System.IO.FileInfo(fileName).Length; 143 var buf = new char[ 64 * 1024];144 var buf = new char[1024 * 1024]; 144 145 using (var reader = new StreamReader(fileName)) { 145 146 reader.ReadBlock(buf, 0, buf.Length); … … 233 234 tokenizer.CurrentLineNumber); 234 235 } 236 if (!IsColumnTypeCompatible(values[colIdx], type)) { 237 values[colIdx] = ConvertToStringColumn(values[colIdx]); 238 } 235 239 // add the value to the column 236 240 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); … … 265 269 266 270 #region type-dependent dispatch 271 private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) { 272 return (list is List<string>) || // all tokens can be added to a string list 273 (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns 274 (tokenType == TokenTypeEnum.Double && list is List<double>) || 275 (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>); 276 } 277 278 // all columns are converted to string columns when we find an non-empty value that has incorrect type 279 private IList ConvertToStringColumn(IList list) { 280 var dblL = list as List<double>; 281 if (dblL != null) { 282 var l = new List<string>(dblL.Capacity); 283 l.AddRange(dblL.Select(dbl => dbl.ToString())); 284 return l; 285 } 286 287 var dtL = list as List<DateTime>; 288 if (dtL != null) { 289 var l = new List<string>(dtL.Capacity); 290 l.AddRange(dtL.Select(dbl => dbl.ToString())); 291 return l; 292 } 293 294 if (list is List<string>) return list; 295 296 throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType())); 297 } 298 267 299 private void AddValue(TokenTypeEnum type, IList list, string strVal, 
double dblVal, DateTime dateTimeVal) { 268 switch (type) {269 case TokenTypeEnum.Double:270 AddDoubleToList(list, dblVal);271 break;272 case TokenTypeEnum.String:273 AddStringToList(list, strVal);274 break;275 case TokenTypeEnum.DateTime:276 AddDateTimeToList(list, dateTimeVal);277 break;278 default:279 throw new InvalidOperationException();280 }281 }282 283 private void AddDoubleToList(IList list, double dblVal) {284 300 var dblList = list as List<double>; 285 if (dblList != null) dblList.Add(dblVal); 286 else { 287 var strList = list as List<string>; 288 if (strList != null) strList.Add(dblVal.ToString()); 289 else list.Add(null); 290 } 291 } 292 293 private void AddStringToList(IList list, string strVal) { 301 if (dblList != null) { 302 AddValue(type, dblList, dblVal); 303 return; 304 } 305 294 306 var strList = list as List<string>; 295 if (strList != null) strList.Add(strVal); 296 else { 297 var dblList = list as List<double>; 298 if (dblList != null) dblList.Add(double.NaN); 299 else list.Add(null); 300 } 301 } 302 303 private void AddDateTimeToList(IList list, DateTime dateTimeVal) { 304 var dateTimeList = list as List<DateTime>; 305 if (dateTimeList != null) dateTimeList.Add(dateTimeVal); 306 else { 307 var dblList = list as List<double>; 308 if (dblList != null) dblList.Add(double.NaN); 309 else { 310 var strList = list as List<string>; 311 if (strList != null) strList.Add(dateTimeVal.ToString()); 312 else list.Add(null); 313 } 314 } 307 if (strList != null) { 308 AddValue(type, strList, strVal); 309 return; 310 } 311 var dtList = list as List<DateTime>; 312 if (dtList != null) { 313 AddValue(type, dtList, dateTimeVal); 314 return; 315 } 316 317 list.Add(strVal); // assumes List<object> 318 } 319 320 private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) { 321 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double); 322 list.Add(type == TokenTypeEnum.Missing ? 
double.NaN : dblVal); 323 } 324 325 private void AddValue(TokenTypeEnum type, List<string> list, string strVal) { 326 // assumes that strVal is always set to the original token read from the input file 327 list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal); 328 } 329 330 private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) { 331 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime); 332 list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal); 315 333 } 316 334 … … 320 338 return new List<string>(estimatedNumberOfLines); 321 339 case TokenTypeEnum.Double: 340 case TokenTypeEnum.Missing: // assume double columns 322 341 return new List<double>(estimatedNumberOfLines); 323 342 case TokenTypeEnum.DateTime: … … 381 400 separator = ','; 382 401 } else { 383 char[] disallowedSeparators = new char[] { ',' }; 402 char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail 384 403 // German format (real values) 385 404 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); … … 389 408 .Where(c => OccurrencesOf(charCounts, c) > 10) 390 409 .OrderBy(c => -OccurrencesOf(charCounts, c)) 391 .DefaultIfEmpty(' ') 410 .DefaultIfEmpty(' ') 392 411 .First(); 393 412 } … … 412 431 // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character 413 432 internal enum TokenTypeEnum { 414 NewLine, String, Double, DateTime 433 NewLine, String, Double, DateTime, Missing 415 434 } 416 435 … … 505 524 type = TokenTypeEnum.DateTime; 506 525 dateTimeVals[i] = dateTimeValue; 526 } else if (string.IsNullOrWhiteSpace(tok)) { 527 type = TokenTypeEnum.Missing; 507 528 } 508 529
Note: See TracChangeset for help on using the changeset viewer.