- Timestamp: 04/04/10 18:53:55 (14 years ago)
- File: 1 copied
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs
r3262 r3264 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-20 08Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 28 28 using System.Text; 29 29 30 namespace HeuristicLab.DataAnalysis { 31 public class DatasetParser { 32 private const string PROBLEMNAME = "PROBLEMNAME"; 30 namespace HeuristicLab.Problems.DataAnalysis.Regression { 31 public class CsvFileParser { 33 32 private const string VARIABLENAMES = "VARIABLENAMES"; 34 private const string TARGETVARIABLE = "TARGETVARIABLE";35 private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";36 private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";37 private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";38 private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";39 private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";40 private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";41 private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";42 private const string TESTSAMPLESEND = "TESTSAMPLESEND";43 private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";44 33 private Tokenizer tokenizer; 45 private Dictionary<string, List<Token>> metadata;46 private List<List<double>> samplesList;34 private List<string> variableNames; 35 private List<List<double>> rowValues; 47 36 48 37 private int rows; … … 58 47 } 59 48 60 private double[ ] samples;61 public double[ ] Samples {49 private double[,] values; 50 public double[,] Values { 62 51 get { 63 return samples;64 } 65 } 66 67 public string ProblemName{52 return values; 53 } 54 } 55 56 public IEnumerable<string> VariableNames { 68 57 get { 69 if (metadata.ContainsKey(PROBLEMNAME)) { 70 return metadata[PROBLEMNAME][0].stringValue; 71 } else return "-"; 72 } 73 } 74 75 public string[] VariableNames { 76 get { 77 if (metadata.ContainsKey(VARIABLENAMES)) { 78 
List<Token> nameList = metadata[VARIABLENAMES]; 79 string[] names = new string[nameList.Count]; 80 for (int i = 0; i < names.Length; i++) { 81 names[i] = nameList[i].stringValue; 82 } 83 return names; 84 } else { 58 if (variableNames.Count > 0) return variableNames; 59 else { 85 60 string[] names = new string[columns]; 86 61 for (int i = 0; i < names.Length; i++) { … … 92 67 } 93 68 94 public int TargetVariable { 95 get { 96 if (metadata.ContainsKey(TARGETVARIABLE)) { 97 return metadata[TARGETVARIABLE][0].intValue; 98 } else return 0; // default is the first column 99 } 100 } 101 102 public int MaxTreeHeight { 103 get { 104 if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) { 105 return metadata[MAXIMUMTREEHEIGHT][0].intValue; 106 } else return 0; 107 } 108 } 109 110 public int MaxTreeSize { 111 get { 112 if (metadata.ContainsKey(MAXIMUMTREESIZE)) { 113 return metadata[MAXIMUMTREESIZE][0].intValue; 114 } else return 0; 115 } 116 } 117 118 public int TrainingSamplesStart { 119 get { 120 if (metadata.ContainsKey(TRAININGSAMPLESSTART)) { 121 return metadata[TRAININGSAMPLESSTART][0].intValue; 122 } else return 0; 123 } 124 } 125 126 public int TrainingSamplesEnd { 127 get { 128 if (metadata.ContainsKey(TRAININGSAMPLESEND)) { 129 return metadata[TRAININGSAMPLESEND][0].intValue; 130 } else return rows; 131 } 132 } 133 public int ValidationSamplesStart { 134 get { 135 if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) { 136 return metadata[VALIDATIONSAMPLESSTART][0].intValue; 137 } else return 0; 138 } 139 } 140 141 public int ValidationSamplesEnd { 142 get { 143 if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) { 144 return metadata[VALIDATIONSAMPLESEND][0].intValue; 145 } else return rows; 146 } 147 } 148 public int TestSamplesStart { 149 get { 150 if (metadata.ContainsKey(TESTSAMPLESSTART)) { 151 return metadata[TESTSAMPLESSTART][0].intValue; 152 } else return 0; 153 } 154 } 155 156 public int TestSamplesEnd { 157 get { 158 if (metadata.ContainsKey(TESTSAMPLESEND)) { 159 return 
metadata[TESTSAMPLESEND][0].intValue; 160 } else return rows; 161 } 162 } 163 164 public List<int> NonInputVariables { 165 get { 166 List<int> disallowedVariables = new List<int>(); 167 if (metadata.ContainsKey(NONINPUTVARIABLES)) { 168 foreach (Token t in metadata[NONINPUTVARIABLES]) { 169 disallowedVariables.Add(t.intValue); 170 } 171 } 172 return disallowedVariables; 173 } 174 } 175 176 public DatasetParser() { 177 this.metadata = new Dictionary<string, List<Token>>(); 178 samplesList = new List<List<double>>(); 179 } 180 181 public void Reset() { 182 metadata.Clear(); 183 samplesList.Clear(); 184 } 185 186 public void Import(string importFileName, bool strict) { 187 TryParse(importFileName, strict); 69 public CsvFileParser() { 70 rowValues = new List<List<double>>(); 71 variableNames = new List<string>(); 72 } 73 74 private void Reset() { 75 variableNames.Clear(); 76 rowValues.Clear(); 77 } 78 79 public void Parse(string fileName) { 80 TryParse(fileName); 188 81 // translate the list of samples into a DoubleMatrixData item 189 samples = new double[samplesList.Count * samplesList[0].Count];190 rows = samplesList.Count;191 columns = samplesList[0].Count;192 193 int i= 0;194 int j= 0;195 foreach (List<double> row in samplesList) {196 j= 0;82 rows = rowValues.Count; 83 columns = rowValues[0].Count; 84 values = new double[rows, columns]; 85 86 int rowIndex = 0; 87 int columnIndex = 0; 88 foreach (List<double> row in rowValues) { 89 columnIndex = 0; 197 90 foreach (double element in row) { 198 samples[i * columns + j] = element; 199 j++; 200 } 201 i++; 202 } 203 } 204 205 private void TryParse(string importFileName, bool strict) { 91 values[rowIndex, columnIndex++] = element; 92 } 93 rowIndex++; 94 } 95 } 96 97 private void TryParse(string fileName) { 206 98 Exception lastEx = null; 207 NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo , CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo};99 
NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo }; 208 100 foreach (NumberFormatInfo numberFormat in possibleFormats) { 209 using (StreamReader reader = new StreamReader( importFileName)) {101 using (StreamReader reader = new StreamReader(fileName)) { 210 102 tokenizer = new Tokenizer(reader, numberFormat); 211 103 try { 212 104 // parse the file 213 Parse( strict);105 Parse(); 214 106 return; // parsed without errors -> return; 215 107 } … … 225 117 #region tokenizer 226 118 internal enum TokenTypeEnum { 227 At, Assign, NewLine, SemiColon, String, Double, Int119 NewLine, Separator, String, Double 228 120 } 229 121 … … 232 124 public string stringValue; 233 125 public double doubleValue; 234 public int intValue;235 126 236 127 public Token(TokenTypeEnum type, string value) { … … 238 129 stringValue = value; 239 130 doubleValue = 0.0; 240 intValue = 0;241 131 } 242 132 … … 247 137 248 138 249 class Tokenizer {139 internal class Tokenizer { 250 140 private StreamReader reader; 251 141 private List<Token> tokens; 252 142 private NumberFormatInfo numberFormatInfo; 253 143 254 public int CurrentLineNumber = 0; 255 public string CurrentLine; 256 257 public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n"); 258 public static Token AtToken = new Token(TokenTypeEnum.At, "@"); 259 public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "="); 260 public static Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";"); 261 262 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) { 144 private int currentLineNumber = 0; 145 public int CurrentLineNumber { 146 get { return currentLineNumber; } 147 private set { currentLineNumber = value; } 148 } 149 private string currentLine; 150 public string CurrentLine { 151 get { return currentLine; } 152 private set { currentLine = value; } 153 } 154 155 private Token newlineToken; 156 public Token NewlineToken { 157 get { return 
newlineToken; } 158 private set { newlineToken = value; } 159 } 160 private Token separatorToken; 161 public Token SeparatorToken { 162 get { return separatorToken; } 163 private set { separatorToken = value; } 164 } 165 166 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) { 263 167 this.reader = reader; 264 168 this.numberFormatInfo = numberFormatInfo; 169 separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString()); 170 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 265 171 tokens = new List<Token>(); 266 172 ReadNextTokens(); 173 } 174 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) 175 : this(reader, numberFormatInfo, ';') { 267 176 } 268 177 … … 284 193 StringBuilder subStr = new StringBuilder(); 285 194 foreach (char c in line) { 286 if (c == ' @' || c == '=' || c == ';') {195 if (c == ';') { 287 196 yield return subStr.ToString(); 288 197 subStr = new StringBuilder(); … … 297 206 private Token MakeToken(string strToken) { 298 207 Token token = new Token(TokenTypeEnum.String, strToken); 299 if (strToken.Equals(AtToken.stringValue)) { 300 return AtToken; 301 } else if (strToken.Equals(AssignmentToken.stringValue)) { 302 return AssignmentToken; 303 } else if (strToken.Equals(SeparatorToken.stringValue)) { 208 if (strToken.Equals(SeparatorToken.stringValue)) { 304 209 return SeparatorToken; 305 } else if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {306 token.type = TokenTypeEnum.Int;307 return token;308 210 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 309 211 token.type = TokenTypeEnum.Double; … … 335 237 336 238 #region parsing 337 private void Parse( bool strict) {338 Parse MetaData(strict);239 private void Parse() { 240 ParseVariableNames(); 339 241 if (!tokenizer.HasNext()) Error("Couldn't parse data values. 
Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 340 Parse SampleData(strict);341 if ( samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);342 } 343 344 private void Parse SampleData(bool strict) {242 ParseValues(); 243 if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 244 } 245 246 private void ParseValues() { 345 247 while (tokenizer.HasNext()) { 346 248 List<double> row = new List<double>(); 347 row.Add(NextValue(tokenizer, strict)); 348 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) { 349 Expect(Tokenizer.SeparatorToken); 350 row.Add(NextValue(tokenizer, strict)); 351 } 352 Expect(Tokenizer.NewlineToken); 353 // when parsing strictly all rows have to have the same number of values 354 if (strict) { 355 // the first row defines how many samples are needed 356 if (samplesList.Count > 0 && samplesList[0].Count != row.Count) { 357 Error("The first row of the dataset has " + samplesList[0].Count + " columns." 
+ 358 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 359 } 360 } else if (samplesList.Count > 0) { 361 // when we are not strict then fill or drop elements as needed 362 if (samplesList[0].Count > row.Count) { 363 // fill with NAN 364 for (int i = row.Count; i < samplesList[0].Count; i++) { 365 row.Add(double.NaN); 366 } 367 } else if (samplesList[0].Count < row.Count) { 368 // drop last k elements where k = n - length of first row 369 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); 370 } 371 } 372 249 row.Add(NextValue(tokenizer)); 250 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 251 Expect(tokenizer.SeparatorToken); 252 row.Add(NextValue(tokenizer)); 253 } 254 Expect(tokenizer.NewlineToken); 255 // all rows have to have the same number of values 256 // the first row defines how many samples are needed 257 if (rowValues.Count > 0 && rowValues[0].Count != row.Count) { 258 Error("The first row of the dataset has " + rowValues[0].Count + " columns." 
+ 259 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 260 } 373 261 // add the current row to the collection of rows and start a new row 374 samplesList.Add(row);262 rowValues.Add(row); 375 263 row = new List<double>(); 376 264 } 377 265 } 378 266 379 private double NextValue(Tokenizer tokenizer , bool strict) {380 if (tokenizer.Peek() == Tokenizer.SeparatorToken || tokenizer.Peek() == Tokenizer.NewlineToken) return double.NaN;267 private double NextValue(Tokenizer tokenizer) { 268 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN; 381 269 Token current = tokenizer.Next(); 382 if (current.type == TokenTypeEnum.Se miColon|| current.type == TokenTypeEnum.String) {270 if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) { 383 271 return double.NaN; 384 272 } else if (current.type == TokenTypeEnum.Double) { 385 273 // just take the value 386 274 return current.doubleValue; 387 } else if (current.type == TokenTypeEnum.Int) { 388 // translate the int value to double 389 return (double)current.intValue; 390 } else { 391 // found an unexpected token => throw error when parsing strictly 392 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 393 if (strict) { 394 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 395 } else { 396 return double.NaN; 397 } 398 } 399 return double.NaN; 400 } 401 402 private void ParseMetaData(bool strict) { 403 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) { 404 Expect(Tokenizer.AtToken); 405 406 Token nameToken = tokenizer.Next(); 407 Expect(Tokenizer.AssignmentToken); 275 } 276 // found an unexpected token => throw error 277 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 278 // this line is never executed because Error() throws an exception 279 throw new 
InvalidOperationException(); 280 } 281 282 private void ParseVariableNames() { 283 // if the first line doesn't start with a double value then we assume that the 284 // first line contains variable names 285 if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) { 408 286 409 287 List<Token> tokens = new List<Token>(); … … 411 289 valueToken = tokenizer.Next(); 412 290 tokens.Add(valueToken); 413 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {414 Expect( Tokenizer.SeparatorToken);291 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 292 Expect(tokenizer.SeparatorToken); 415 293 valueToken = tokenizer.Next(); 416 if (valueToken != Tokenizer.NewlineToken) {294 if (valueToken != tokenizer.NewlineToken) { 417 295 tokens.Add(valueToken); 418 296 } 419 297 } 420 if (valueToken != Tokenizer.NewlineToken) {421 Expect( Tokenizer.NewlineToken);422 } 423 metadata[nameToken.stringValue] = tokens;298 if (valueToken != tokenizer.NewlineToken) { 299 Expect(tokenizer.NewlineToken); 300 } 301 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 424 302 } 425 303 }
Note: See TracChangeset for help on using the changeset viewer.