- Timestamp:
- 05/29/08 17:22:10 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs
r173 r272 61 61 List<Token> nameList = metadata["VARIABLENAMES"]; 62 62 string[] names = new string[nameList.Count]; 63 for 63 for(int i = 0; i < names.Length; i++) { 64 64 names[i] = nameList[i].stringValue; 65 65 } … … 111 111 tokenizer.Separators = new string[] { " ", ";", "\t" }; 112 112 113 // parse the file 114 Parse(strict); 113 try { 114 // parse the file 115 Parse(strict); 116 } finally { 117 reader.Close(); 118 } 115 119 116 120 // translate the list of samples into a DoubleMatrixData item … … 121 125 int i = 0; 122 126 int j = 0; 123 foreach 127 foreach(List<double> row in samplesList) { 124 128 j = 0; 125 foreach 129 foreach(double element in row) { 126 130 samples[i * columns + j] = element; 127 131 j++; … … 180 184 181 185 private void ReadNextTokens() { 182 if 186 if(!reader.EndOfStream) { 183 187 CurrentLine = reader.ReadLine(); 184 188 Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) { … … 193 197 194 198 private Token MakeToken(string strToken) { 195 if 199 if(strToken == "@") 196 200 return AtToken; 197 else if 201 else if(strToken == "=") 198 202 return AssignmentToken; 199 203 else { … … 202 206 // try invariant culture 203 207 NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat; 204 if 208 if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) { 205 209 token.type = TokenTypeEnum.Int; 206 210 return token; 207 } else if 211 } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) { 208 212 token.type = TokenTypeEnum.Double; 209 213 return token; … … 211 215 // try german culture 212 216 currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat; 213 if 217 if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) { 214 218 token.type = TokenTypeEnum.Int; 215 219 return token; 216 } else if 220 } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) { 217 221 token.type = TokenTypeEnum.Double; 218 222 return token; … … 221 225 // try current culture 222 226 currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat; 223 if 227 if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) { 224 228 token.type = TokenTypeEnum.Int; 225 229 return token; 226 } else if 230 } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) { 227 231 token.type = TokenTypeEnum.Double; 228 232 return token; … … 241 245 Token next = tokens[0]; 242 246 tokens.RemoveAt(0); 243 if 247 if(tokens.Count == 0) { 244 248 ReadNextTokens(); 245 249 } … … 261 265 private void ParseSampleData(bool strict) { 262 266 List<double> row = new List<double>(); 263 while 267 while(tokenizer.HasNext()) { 264 268 Token current = tokenizer.Next(); 265 if 269 if(current.type == TokenTypeEnum.Double) { 266 270 // just take the value 267 271 row.Add(current.doubleValue); 268 } else if 272 } else if(current.type == TokenTypeEnum.Int) { 269 273 // translate the int value to double 270 274 row.Add((double)current.intValue); 271 } else if 275 } else if(current == Tokenizer.NewlineToken) { 272 276 // when parsing strictly all rows have to have the same number of values 273 if 277 if(strict) { 274 278 // the first row defines how many samples are needed 275 if 279 if(samplesList.Count > 0 && samplesList[0].Count != row.Count) { 276 280 Error("The first row of the dataset has " + samplesList[0].Count + " columns." + 277 281 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns."); 278 282 } 279 } else if 283 } else if(samplesList.Count > 0) { 280 284 // when we are not strict then fill or drop elements as needed 281 if 285 if(samplesList[0].Count > row.Count) { 282 286 // fill with NAN 283 for 287 for(int i = row.Count; i < samplesList[0].Count; i++) { 284 288 row.Add(double.NaN); 285 289 } 286 } else if 290 } else if(samplesList[0].Count < row.Count) { 287 291 // drop last k elements where k = n - length of first row 288 292 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); … … 296 300 // found an unexpected token => return false when parsing strictly 297 301 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 298 if 302 if(strict) { 299 303 Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber + 300 304 "\n" + tokenizer.CurrentLine); … … 307 311 308 312 private void ParseMetaData(bool strict) { 309 while 313 while(tokenizer.Peek() == Tokenizer.AtToken) { 310 314 Expect(Tokenizer.AtToken); 311 315 312 316 Token nameToken = tokenizer.Next(); 313 if 317 if(nameToken.type != TokenTypeEnum.String) 314 318 throw new Exception("Expected a variable name; got " + nameToken + 315 319 "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine); … … 319 323 List<Token> tokens = new List<Token>(); 320 324 Token valueToken = tokenizer.Next(); 321 while 325 while(valueToken != Tokenizer.NewlineToken) { 322 326 tokens.Add(valueToken); 323 327 valueToken = tokenizer.Next(); … … 330 334 private void Expect(Token expectedToken) { 331 335 Token actualToken = tokenizer.Next(); 332 if 336 if(actualToken != expectedToken) { 333 337 Error("Expected: " + expectedToken + " got: " + actualToken + 334 338 "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
Note: See TracChangeset
for help on using the changeset viewer.