- Timestamp:
- 09/12/11 13:48:31 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs
r5809 r6740 21 21 22 22 using System; 23 using System.Collections; 23 24 using System.Collections.Generic; 24 25 using System.Globalization; … … 33 34 private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' }; 34 35 private Tokenizer tokenizer; 35 private List<List< double>> rowValues;36 private List<List<object>> rowValues; 36 37 37 38 private int rows; … … 47 48 } 48 49 49 private double[,]values;50 public double[,]Values {50 private List<IList> values; 51 public List<IList> Values { 51 52 get { 52 53 return values; … … 69 70 70 71 public TableFileParser() { 71 rowValues = new List<List< double>>();72 rowValues = new List<List<object>>(); 72 73 variableNames = new List<string>(); 73 74 } … … 75 76 public void Parse(string fileName) { 76 77 NumberFormatInfo numberFormat; 78 DateTimeFormatInfo dateTimeFormatInfo; 77 79 char separator; 78 DetermineFileFormat(fileName, out numberFormat, out separator);80 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 79 81 using (StreamReader reader = new StreamReader(fileName)) { 80 tokenizer = new Tokenizer(reader, numberFormat, separator);82 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 81 83 // parse the file 82 84 Parse(); … … 86 88 rows = rowValues.Count; 87 89 columns = rowValues[0].Count; 88 values = new double[rows, columns]; 89 90 int rowIndex = 0; 91 int columnIndex = 0; 92 foreach (List<double> row in rowValues) { 93 columnIndex = 0; 94 foreach (double element in row) { 95 values[rowIndex, columnIndex++] = element; 96 } 97 rowIndex++; 98 } 99 } 100 101 private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) { 90 values = new List<IList>(); 91 92 //create columns 93 for (int col = 0; col < columns; col++) { 94 var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType()); 95 if (!types.Any()) { 96 values.Add(new List<string>()); 97 continue; 98 } 99 100 var columnType = types.GroupBy(v => v).OrderBy(v => v).Last().Key; 101 if (columnType == typeof(double)) values.Add(new List<double>()); 102 else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>()); 103 else if (columnType == typeof(string)) values.Add(new List<string>()); 104 else throw new InvalidOperationException(); 105 } 106 107 108 109 //fill with values 110 foreach (List<object> row in rowValues) { 111 int columnIndex = 0; 112 foreach (object element in row) { 113 //handle missing values with default values 114 if (element as string == string.Empty) { 115 if (values[columnIndex] is List<double>) values[columnIndex].Add(double.NaN); 116 else if (values[columnIndex] is List<DateTime>) values[columnIndex].Add(DateTime.MinValue); 117 else if (values[columnIndex] is List<string>) values[columnIndex].Add(string.Empty); 118 else throw new InvalidOperationException(); 119 } else values[columnIndex].Add(element); 120 columnIndex++; 121 } 122 } 123 } 124 125 private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) { 102 126 using (StreamReader reader = new StreamReader(fileName)) { 103 127 // skip first line … … 123 147 if (OccurrencesOf(charCounts, '.') > 10) { 124 148 numberFormat = NumberFormatInfo.InvariantInfo; 149 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 125 150 separator = POSSIBLE_SEPARATORS 126 151 .Where(c => OccurrencesOf(charCounts, c) > 10) … … 139 164 // English format (only integer values) with ',' as separator 140 165 numberFormat = NumberFormatInfo.InvariantInfo; 166 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 141 167 separator = ','; 142 168 } else { … … 144 170 // German format (real values) 145 171 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); 172 dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")); 146 173 separator = POSSIBLE_SEPARATORS 147 174 .Except(disallowedSeparators) … … 154 181 // no points and no commas => English format 155 182 numberFormat = NumberFormatInfo.InvariantInfo; 183 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 156 184 separator = POSSIBLE_SEPARATORS 157 185 .Where(c => OccurrencesOf(charCounts, c) > 10) … … 169 197 #region tokenizer 170 198 internal enum TokenTypeEnum { 171 NewLine, Separator, String, Double 199 NewLine, Separator, String, Double, DateTime 172 200 } 173 201 … … 176 204 public string stringValue; 177 205 public double doubleValue; 206 public DateTime dateTimeValue; 178 207 179 208 public Token(TokenTypeEnum type, string value) { 180 209 this.type = type; 181 210 stringValue = value; 211 dateTimeValue = DateTime.MinValue; 182 212 doubleValue = 0.0; 183 213 } … … 193 223 private List<Token> tokens; 194 224 private NumberFormatInfo numberFormatInfo; 225 private DateTimeFormatInfo dateTimeFormatInfo; 195 226 private char separator; 196 227 private const string INTERNAL_SEPARATOR = "#"; … … 218 249 } 219 250 220 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {251 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) { 221 252 this.reader = reader; 222 253 this.numberFormatInfo = numberFormatInfo; 254 this.dateTimeFormatInfo = dateTimeFormatInfo; 223 255 this.separator = separator; 224 256 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); … … 264 296 token.type = TokenTypeEnum.Double; 265 297 return token; 266 } 267 268 // couldn't parse the token as an int or float number so return a string token 298 } else if (DateTime.TryParse(strToken, out token.dateTimeValue)) { 299 token.type = TokenTypeEnum.DateTime; 300 return token; 301 } 302 303 // couldn't parse the token as an int or float number or datetime value so return a string token 269 304 return token; 270 305 } … … 299 334 private void ParseValues() { 300 335 while (tokenizer.HasNext()) { 301 List<double> row = new List<double>(); 302 row.Add(NextValue(tokenizer)); 336 List<object> row = new List<object>(); 337 object value = NextValue(tokenizer); 338 if (value == null) { tokenizer.Next(); continue; } 339 row.Add(value); 303 340 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 304 341 Expect(tokenizer.SeparatorToken); … … 312 349 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 313 350 } 314 // add the current row to the collection of rows and start a new row315 351 rowValues.Add(row); 316 row = new List<double>(); 317 } 318 } 319 320 private double NextValue(Tokenizer tokenizer) { 321 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN; 352 row = new List<object>(); 353 } 354 } 355 356 private object NextValue(Tokenizer tokenizer) { 357 if (tokenizer.Peek() == tokenizer.SeparatorToken) return string.Empty; 358 if (tokenizer.Peek() == tokenizer.NewlineToken) return null; 322 359 Token current = tokenizer.Next(); 323 if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {360 if (current.type == TokenTypeEnum.Separator) { 324 361 return double.NaN; 362 } else if (current.type == TokenTypeEnum.String) { 363 return current.stringValue; 325 364 } else if (current.type == TokenTypeEnum.Double) { 326 // just take the value327 365 return current.doubleValue; 366 } else if (current.type == TokenTypeEnum.DateTime) { 367 return current.dateTimeValue; 328 368 } 329 369 // found an unexpected token => throw error … … 334 374 335 375 private void ParseVariableNames() { 336 // if the first line doesn't start with a double value then we assume that the 337 // first line contains variable names 338 if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) { 339 340 List<Token> tokens = new List<Token>(); 341 Token valueToken; 376 //if first token is double no variables names are given 377 if (tokenizer.Peek().type == TokenTypeEnum.Double) return; 378 379 // the first line must contain variable names 380 List<Token> tokens = new List<Token>(); 381 Token valueToken; 382 valueToken = tokenizer.Next(); 383 tokens.Add(valueToken); 384 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 385 Expect(tokenizer.SeparatorToken); 342 386 valueToken = tokenizer.Next(); 343 tokens.Add(valueToken);344 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {345 Expect(tokenizer.SeparatorToken);346 valueToken = tokenizer.Next();347 if (valueToken != tokenizer.NewlineToken) {348 tokens.Add(valueToken);349 }350 }351 387 if (valueToken != tokenizer.NewlineToken) { 352 Expect(tokenizer.NewlineToken); 353 } 354 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 355 } 388 tokens.Add(valueToken); 389 } 390 } 391 if (valueToken != tokenizer.NewlineToken) { 392 Expect(tokenizer.NewlineToken); 393 } 394 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 356 395 } 357 396
Note: See TracChangeset
for help on using the changeset viewer.