Changeset 1221
- Timestamp: 02/23/09 15:33:07 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs
r620 r1221 65 65 public string ProblemName { 66 66 get { 67 if (metadata.ContainsKey(PROBLEMNAME)) {67 if (metadata.ContainsKey(PROBLEMNAME)) { 68 68 return metadata[PROBLEMNAME][0].stringValue; 69 69 } else return "-"; … … 73 73 public string[] VariableNames { 74 74 get { 75 if (metadata.ContainsKey(VARIABLENAMES)) {75 if (metadata.ContainsKey(VARIABLENAMES)) { 76 76 List<Token> nameList = metadata[VARIABLENAMES]; 77 77 string[] names = new string[nameList.Count]; 78 for (int i = 0; i < names.Length; i++) {78 for (int i = 0; i < names.Length; i++) { 79 79 names[i] = nameList[i].stringValue; 80 80 } … … 82 82 } else { 83 83 string[] names = new string[columns]; 84 for (int i = 0; i < names.Length; i++) {84 for (int i = 0; i < names.Length; i++) { 85 85 names[i] = "X" + i.ToString("000"); 86 86 } … … 92 92 public int TargetVariable { 93 93 get { 94 if (metadata.ContainsKey(TARGETVARIABLE)) {94 if (metadata.ContainsKey(TARGETVARIABLE)) { 95 95 return metadata[TARGETVARIABLE][0].intValue; 96 96 } else return 0; // default is the first column … … 100 100 public int MaxTreeHeight { 101 101 get { 102 if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {102 if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) { 103 103 return metadata[MAXIMUMTREEHEIGHT][0].intValue; 104 104 } else return 0; … … 108 108 public int MaxTreeSize { 109 109 get { 110 if (metadata.ContainsKey(MAXIMUMTREESIZE)) {110 if (metadata.ContainsKey(MAXIMUMTREESIZE)) { 111 111 return metadata[MAXIMUMTREESIZE][0].intValue; 112 112 } else return 0; … … 116 116 public int TrainingSamplesStart { 117 117 get { 118 if (metadata.ContainsKey(TRAININGSAMPLESSTART)) {118 if (metadata.ContainsKey(TRAININGSAMPLESSTART)) { 119 119 return metadata[TRAININGSAMPLESSTART][0].intValue; 120 120 } else return 0; … … 124 124 public int TrainingSamplesEnd { 125 125 get { 126 if (metadata.ContainsKey(TRAININGSAMPLESEND)) {126 if (metadata.ContainsKey(TRAININGSAMPLESEND)) { 127 127 return metadata[TRAININGSAMPLESEND][0].intValue; 128 128 } 
else return rows; … … 131 131 public int ValidationSamplesStart { 132 132 get { 133 if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {133 if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) { 134 134 return metadata[VALIDATIONSAMPLESSTART][0].intValue; 135 135 } else return 0; … … 139 139 public int ValidationSamplesEnd { 140 140 get { 141 if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) {141 if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) { 142 142 return metadata[VALIDATIONSAMPLESEND][0].intValue; 143 143 } else return rows; … … 146 146 public int TestSamplesStart { 147 147 get { 148 if (metadata.ContainsKey(TESTSAMPLESSTART)) {148 if (metadata.ContainsKey(TESTSAMPLESSTART)) { 149 149 return metadata[TESTSAMPLESSTART][0].intValue; 150 150 } else return 0; … … 154 154 public int TestSamplesEnd { 155 155 get { 156 if (metadata.ContainsKey(TESTSAMPLESEND)) {156 if (metadata.ContainsKey(TESTSAMPLESEND)) { 157 157 return metadata[TESTSAMPLESEND][0].intValue; 158 158 } else return rows; … … 163 163 get { 164 164 List<int> disallowedVariables = new List<int>(); 165 if (metadata.ContainsKey(NONINPUTVARIABLES)) {166 foreach (Token t in metadata[NONINPUTVARIABLES]) {165 if (metadata.ContainsKey(NONINPUTVARIABLES)) { 166 foreach (Token t in metadata[NONINPUTVARIABLES]) { 167 167 disallowedVariables.Add(t.intValue); 168 168 } … … 191 191 int i = 0; 192 192 int j = 0; 193 foreach (List<double> row in samplesList) {193 foreach (List<double> row in samplesList) { 194 194 j = 0; 195 foreach (double element in row) {195 foreach (double element in row) { 196 196 samples[i * columns + j] = element; 197 197 j++; … … 204 204 Exception lastEx = null; 205 205 NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo }; 206 foreach (NumberFormatInfo numberFormat in possibleFormats) {207 using (StreamReader reader = new StreamReader(importFileName)) {206 foreach 
(NumberFormatInfo numberFormat in possibleFormats) { 207 using (StreamReader reader = new StreamReader(importFileName)) { 208 208 tokenizer = new Tokenizer(reader, numberFormat); 209 209 try { … … 211 211 Parse(strict); 212 212 return; // parsed without errors -> return; 213 } catch(DataFormatException ex) { 213 } 214 catch (DataFormatException ex) { 214 215 lastEx = ex; 215 216 } … … 271 272 272 273 private void ReadNextTokens() { 273 if (!reader.EndOfStream) {274 if (!reader.EndOfStream) { 274 275 CurrentLine = reader.ReadLine(); 275 Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions. RemoveEmptyEntries), delegate(string str) {276 Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.None), delegate(string str) { 276 277 return MakeToken(str.Trim()); 277 278 }); 278 279 279 foreach (Token tok in newTokens) {280 if (tok != SeparatorToken) tokens.Add(tok);280 foreach (Token tok in newTokens) { 281 if (tok != SeparatorToken) tokens.Add(tok); 281 282 } 282 283 tokens.Add(NewlineToken); … … 289 290 290 291 // try to parse as a number first 291 if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {292 if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) { 292 293 token.type = TokenTypeEnum.Int; 293 294 return token; 294 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {295 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 295 296 token.type = TokenTypeEnum.Double; 297 return token; 298 } else if (String.IsNullOrEmpty(strToken)) { 299 token.type = TokenTypeEnum.WhiteSpace; 296 300 return token; 297 301 } … … 307 311 Token next = tokens[0]; 308 312 tokens.RemoveAt(0); 309 if (tokens.Count == 0) {313 if (tokens.Count == 0) { 310 314 ReadNextTokens(); 311 315 } … … 322 326 private void Parse(bool strict) { 323 327 ParseMetaData(strict); 
324 if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);328 if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 325 329 ParseSampleData(strict); 326 if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);330 if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 327 331 } 328 332 329 333 private void ParseSampleData(bool strict) { 330 334 List<double> row = new List<double>(); 331 while (tokenizer.HasNext()) {335 while (tokenizer.HasNext()) { 332 336 Token current = tokenizer.Next(); 333 if(current.type == TokenTypeEnum.Double) { 337 if (current.type == TokenTypeEnum.WhiteSpace) { 338 row.Add(double.NaN); 339 } else if (current.type == TokenTypeEnum.Double) { 334 340 // just take the value 335 341 row.Add(current.doubleValue); 336 } else if (current.type == TokenTypeEnum.Int) {342 } else if (current.type == TokenTypeEnum.Int) { 337 343 // translate the int value to double 338 344 row.Add((double)current.intValue); 339 } else if (current == Tokenizer.NewlineToken) {345 } else if (current == Tokenizer.NewlineToken) { 340 346 // when parsing strictly all rows have to have the same number of values 341 if (strict) {347 if (strict) { 342 348 // the first row defines how many samples are needed 343 if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {349 if (samplesList.Count > 0 && samplesList[0].Count != 
row.Count) { 344 350 Error("The first row of the dataset has " + samplesList[0].Count + " columns." + 345 351 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 346 352 } 347 } else if (samplesList.Count > 0) {353 } else if (samplesList.Count > 0) { 348 354 // when we are not strict then fill or drop elements as needed 349 if (samplesList[0].Count > row.Count) {355 if (samplesList[0].Count > row.Count) { 350 356 // fill with NAN 351 for (int i = row.Count; i < samplesList[0].Count; i++) {357 for (int i = row.Count; i < samplesList[0].Count; i++) { 352 358 row.Add(double.NaN); 353 359 } 354 } else if (samplesList[0].Count < row.Count) {360 } else if (samplesList[0].Count < row.Count) { 355 361 // drop last k elements where k = n - length of first row 356 362 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); … … 364 370 // found an unexpected token => return false when parsing strictly 365 371 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 366 if (strict) {372 if (strict) { 367 373 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 368 374 } else { … … 374 380 375 381 private void ParseMetaData(bool strict) { 376 while(tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.String) { 382 while (tokenizer.HasNext() && (tokenizer.Peek().type == TokenTypeEnum.WhiteSpace || tokenizer.Peek().type == TokenTypeEnum.String)) { 383 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next(); 377 384 Token nameToken = tokenizer.Next(); 378 if (nameToken.type != TokenTypeEnum.String)385 if (nameToken.type != TokenTypeEnum.String) 379 386 Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber); 380 387 381 388 List<Token> tokens = new List<Token>(); 382 Token valueToken = tokenizer.Next(); 383 while(valueToken != Tokenizer.NewlineToken) { 389 Token 
valueToken; 390 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) valueToken = tokenizer.Next(); 391 valueToken = tokenizer.Next(); 392 while (valueToken != Tokenizer.NewlineToken) { 384 393 tokens.Add(valueToken); 394 while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next(); 385 395 valueToken = tokenizer.Next(); 386 396 } … … 392 402 private void Expect(Token expectedToken) { 393 403 Token actualToken = tokenizer.Next(); 394 if (actualToken != expectedToken) {404 if (actualToken != expectedToken) { 395 405 Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber); 396 406 }
Note: See TracChangeset for help on using the changeset viewer.