Changeset 2446 for trunk/sources
- Timestamp:
- 10/22/09 13:31:31 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.DataAnalysis/3.2/DatasetParser.cs
r1529 r2446 24 24 using System.Globalization; 25 25 using System.IO; 26 using System.Linq; 26 27 using HeuristicLab.Data; 28 using System.Text; 27 29 28 30 namespace HeuristicLab.DataAnalysis { … … 223 225 #region tokenizer 224 226 internal enum TokenTypeEnum { 225 At, Assign, NewLine, S tring, Double, Int, WhiteSpace227 At, Assign, NewLine, SemiColon, String, Double, Int 226 228 } 227 229 … … 248 250 private StreamReader reader; 249 251 private List<Token> tokens; 250 private string[] separators = new string[] { "@", "=", ";", "\t" };251 252 private NumberFormatInfo numberFormatInfo; 252 253 … … 257 258 public static Token AtToken = new Token(TokenTypeEnum.At, "@"); 258 259 public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "="); 259 public static Token SeparatorToken = new Token(TokenTypeEnum.WhiteSpace, ""); 260 public string[] Separators { 261 get { return separators; } 262 set { separators = value; } 263 } 264 260 public static Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";"); 265 261 266 262 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) { … … 274 270 if (!reader.EndOfStream) { 275 271 CurrentLine = reader.ReadLine(); 276 Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.None), delegate(string str) { 277 return MakeToken(str.Trim()); 278 }); 279 280 foreach (Token tok in newTokens) { 281 if (tok != SeparatorToken) tokens.Add(tok); 282 } 272 var newTokens = from str in Split(CurrentLine) 273 let trimmedStr = str.Trim() 274 where !string.IsNullOrEmpty(trimmedStr) 275 select MakeToken(trimmedStr.Trim()); 276 277 tokens.AddRange(newTokens); 283 278 tokens.Add(NewlineToken); 284 279 CurrentLineNumber++; … … 286 281 } 287 282 283 private IEnumerable<string> Split(string line) { 284 StringBuilder subStr = new StringBuilder(); 285 foreach (char c in line) { 286 if (c == '@' || c == '=' || c == ';') { 287 yield return subStr.ToString(); 288 subStr = new StringBuilder(); 289 yield return c.ToString(); 290 } else { 291 subStr.Append(c); 292 } 293 } 294 yield return subStr.ToString(); 295 } 296 288 297 private Token MakeToken(string strToken) { 289 298 Token token = new Token(TokenTypeEnum.String, strToken); 290 291 // try to parse as a number first 292 if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) { 299 if (strToken.Equals(AtToken.stringValue)) { 300 return AtToken; 301 } else if (strToken.Equals(AssignmentToken.stringValue)) { 302 return AssignmentToken; 303 } else if (strToken.Equals(SeparatorToken.stringValue)) { 304 return SeparatorToken; 305 } else if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) { 293 306 token.type = TokenTypeEnum.Int; 294 307 return token; … … 296 309 token.type = TokenTypeEnum.Double; 297 310 return token; 298 } else if (String.IsNullOrEmpty(strToken)) { 299 token.type = TokenTypeEnum.WhiteSpace; 300 return token; 301 } 311 } 312 302 313 // couldn't parse the token as an int or float number so return a string token 303 314 return token; … … 332 343 333 344 private void ParseSampleData(bool strict) { 334 List<double> row = new List<double>();335 345 while (tokenizer.HasNext()) { 336 Token current = tokenizer.Next(); 337 if (current.type == TokenTypeEnum.WhiteSpace) { 338 row.Add(double.NaN); 339 } else if (current.type == TokenTypeEnum.Double) { 340 // just take the value 341 row.Add(current.doubleValue); 342 } else if (current.type == TokenTypeEnum.Int) { 343 // translate the int value to double 344 row.Add((double)current.intValue); 345 } else if (current == Tokenizer.NewlineToken) { 346 // when parsing strictly all rows have to have the same number of values 347 if (strict) { 348 // the first row defines how many samples are needed 349 if (samplesList.Count > 0 && samplesList[0].Count != row.Count) { 350 Error("The first row of the dataset has " + samplesList[0].Count + " columns." + 351 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 346 List<double> row = new List<double>(); 347 row.Add(NextValue(tokenizer, strict)); 348 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) { 349 Expect(Tokenizer.SeparatorToken); 350 row.Add(NextValue(tokenizer, strict)); 351 } 352 Expect(Tokenizer.NewlineToken); 353 // when parsing strictly all rows have to have the same number of values 354 if (strict) { 355 // the first row defines how many samples are needed 356 if (samplesList.Count > 0 && samplesList[0].Count != row.Count) { 357 Error("The first row of the dataset has " + samplesList[0].Count + " columns." + 358 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 359 } 360 } else if (samplesList.Count > 0) { 361 // when we are not strict then fill or drop elements as needed 362 if (samplesList[0].Count > row.Count) { 363 // fill with NAN 364 for (int i = row.Count; i < samplesList[0].Count; i++) { 365 row.Add(double.NaN); 352 366 } 353 } else if (samplesList.Count > 0) { 354 // when we are not strict then fill or drop elements as needed 355 if (samplesList[0].Count > row.Count) { 356 // fill with NAN 357 for (int i = row.Count; i < samplesList[0].Count; i++) { 358 row.Add(double.NaN); 359 } 360 } else if (samplesList[0].Count < row.Count) { 361 // drop last k elements where k = n - length of first row 362 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); 363 } 364 } 365 366 // add the current row to the collection of rows and start a new row 367 samplesList.Add(row); 368 row = new List<double>(); 367 } else if (samplesList[0].Count < row.Count) { 368 // drop last k elements where k = n - length of first row 369 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); 370 } 371 } 372 373 // add the current row to the collection of rows and start a new row 374 samplesList.Add(row); 375 row = new List<double>(); 376 } 377 } 378 379 private double NextValue(Tokenizer tokenizer, bool strict) { 380 if (tokenizer.Peek() == Tokenizer.SeparatorToken || tokenizer.Peek() == Tokenizer.NewlineToken) return double.NaN; 381 Token current = tokenizer.Next(); 382 if (current.type == TokenTypeEnum.SemiColon || current.type == TokenTypeEnum.String) { 383 return double.NaN; 384 } else if (current.type == TokenTypeEnum.Double) { 385 // just take the value 386 return current.doubleValue; 387 } else if (current.type == TokenTypeEnum.Int) { 388 // translate the int value to double 389 return (double)current.intValue; 390 } else { 391 // found an unexpected token => throw error when parsing strictly 392 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 393 if (strict) { 394 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 369 395 } else { 370 // found an unexpected token => return false when parsing strictly 371 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 372 if (strict) { 373 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 374 } else { 375 row.Add(double.NaN); 376 } 377 } 378 } 396 return double.NaN; 397 } 398 } 399 return double.NaN; 379 400 } 380 401 381 402 private void ParseMetaData(bool strict) { 382 while (tokenizer.HasNext() && (tokenizer.Peek().type == TokenTypeEnum.WhiteSpace || tokenizer.Peek().type == TokenTypeEnum.String)) { 383 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next(); 403 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) { 404 Expect(Tokenizer.AtToken); 405 384 406 Token nameToken = tokenizer.Next(); 385 if (nameToken.type != TokenTypeEnum.String) 386 Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber); 407 Expect(Tokenizer.AssignmentToken); 387 408 388 409 List<Token> tokens = new List<Token>(); 389 410 Token valueToken; 390 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) valueToken = tokenizer.Next();391 411 valueToken = tokenizer.Next(); 392 while (valueToken != Tokenizer.NewlineToken) {393 tokens.Add(valueToken);394 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next();412 tokens.Add(valueToken); 413 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) { 414 Expect(Tokenizer.SeparatorToken); 395 415 valueToken = tokenizer.Next(); 396 } 397 416 if (valueToken != Tokenizer.NewlineToken) { 417 tokens.Add(valueToken); 418 } 419 } 420 if (valueToken != Tokenizer.NewlineToken) { 421 Expect(Tokenizer.NewlineToken); 422 } 398 423 metadata[nameToken.stringValue] = tokens; 399 424 }
Note: See TracChangeset
for help on using the changeset viewer.