- Timestamp: 04/04/10 18:53:55 (15 years ago)
- Location: trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3
- Files: 2 added, 3 edited, 2 copied
Legend:
- Unmodified
- Added
- Removed
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs
r3262 r3264 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-20 08Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 28 28 using System.Text; 29 29 30 namespace HeuristicLab.DataAnalysis { 31 public class DatasetParser { 32 private const string PROBLEMNAME = "PROBLEMNAME"; 30 namespace HeuristicLab.Problems.DataAnalysis.Regression { 31 public class CsvFileParser { 33 32 private const string VARIABLENAMES = "VARIABLENAMES"; 34 private const string TARGETVARIABLE = "TARGETVARIABLE";35 private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";36 private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";37 private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";38 private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";39 private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";40 private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";41 private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";42 private const string TESTSAMPLESEND = "TESTSAMPLESEND";43 private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";44 33 private Tokenizer tokenizer; 45 private Dictionary<string, List<Token>> metadata;46 private List<List<double>> samplesList;34 private List<string> variableNames; 35 private List<List<double>> rowValues; 47 36 48 37 private int rows; … … 58 47 } 59 48 60 private double[ ] samples;61 public double[ ] Samples {49 private double[,] values; 50 public double[,] Values { 62 51 get { 63 return samples;64 } 65 } 66 67 public string ProblemName{52 return values; 53 } 54 } 55 56 public IEnumerable<string> VariableNames { 68 57 get { 69 if (metadata.ContainsKey(PROBLEMNAME)) { 70 return metadata[PROBLEMNAME][0].stringValue; 71 } else return "-"; 72 } 73 } 74 75 public string[] VariableNames { 76 get { 77 if (metadata.ContainsKey(VARIABLENAMES)) { 78 
List<Token> nameList = metadata[VARIABLENAMES]; 79 string[] names = new string[nameList.Count]; 80 for (int i = 0; i < names.Length; i++) { 81 names[i] = nameList[i].stringValue; 82 } 83 return names; 84 } else { 58 if (variableNames.Count > 0) return variableNames; 59 else { 85 60 string[] names = new string[columns]; 86 61 for (int i = 0; i < names.Length; i++) { … … 92 67 } 93 68 94 public int TargetVariable { 95 get { 96 if (metadata.ContainsKey(TARGETVARIABLE)) { 97 return metadata[TARGETVARIABLE][0].intValue; 98 } else return 0; // default is the first column 99 } 100 } 101 102 public int MaxTreeHeight { 103 get { 104 if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) { 105 return metadata[MAXIMUMTREEHEIGHT][0].intValue; 106 } else return 0; 107 } 108 } 109 110 public int MaxTreeSize { 111 get { 112 if (metadata.ContainsKey(MAXIMUMTREESIZE)) { 113 return metadata[MAXIMUMTREESIZE][0].intValue; 114 } else return 0; 115 } 116 } 117 118 public int TrainingSamplesStart { 119 get { 120 if (metadata.ContainsKey(TRAININGSAMPLESSTART)) { 121 return metadata[TRAININGSAMPLESSTART][0].intValue; 122 } else return 0; 123 } 124 } 125 126 public int TrainingSamplesEnd { 127 get { 128 if (metadata.ContainsKey(TRAININGSAMPLESEND)) { 129 return metadata[TRAININGSAMPLESEND][0].intValue; 130 } else return rows; 131 } 132 } 133 public int ValidationSamplesStart { 134 get { 135 if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) { 136 return metadata[VALIDATIONSAMPLESSTART][0].intValue; 137 } else return 0; 138 } 139 } 140 141 public int ValidationSamplesEnd { 142 get { 143 if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) { 144 return metadata[VALIDATIONSAMPLESEND][0].intValue; 145 } else return rows; 146 } 147 } 148 public int TestSamplesStart { 149 get { 150 if (metadata.ContainsKey(TESTSAMPLESSTART)) { 151 return metadata[TESTSAMPLESSTART][0].intValue; 152 } else return 0; 153 } 154 } 155 156 public int TestSamplesEnd { 157 get { 158 if (metadata.ContainsKey(TESTSAMPLESEND)) { 159 return 
metadata[TESTSAMPLESEND][0].intValue; 160 } else return rows; 161 } 162 } 163 164 public List<int> NonInputVariables { 165 get { 166 List<int> disallowedVariables = new List<int>(); 167 if (metadata.ContainsKey(NONINPUTVARIABLES)) { 168 foreach (Token t in metadata[NONINPUTVARIABLES]) { 169 disallowedVariables.Add(t.intValue); 170 } 171 } 172 return disallowedVariables; 173 } 174 } 175 176 public DatasetParser() { 177 this.metadata = new Dictionary<string, List<Token>>(); 178 samplesList = new List<List<double>>(); 179 } 180 181 public void Reset() { 182 metadata.Clear(); 183 samplesList.Clear(); 184 } 185 186 public void Import(string importFileName, bool strict) { 187 TryParse(importFileName, strict); 69 public CsvFileParser() { 70 rowValues = new List<List<double>>(); 71 variableNames = new List<string>(); 72 } 73 74 private void Reset() { 75 variableNames.Clear(); 76 rowValues.Clear(); 77 } 78 79 public void Parse(string fileName) { 80 TryParse(fileName); 188 81 // translate the list of samples into a DoubleMatrixData item 189 samples = new double[samplesList.Count * samplesList[0].Count];190 rows = samplesList.Count;191 columns = samplesList[0].Count;192 193 int i= 0;194 int j= 0;195 foreach (List<double> row in samplesList) {196 j= 0;82 rows = rowValues.Count; 83 columns = rowValues[0].Count; 84 values = new double[rows, columns]; 85 86 int rowIndex = 0; 87 int columnIndex = 0; 88 foreach (List<double> row in rowValues) { 89 columnIndex = 0; 197 90 foreach (double element in row) { 198 samples[i * columns + j] = element; 199 j++; 200 } 201 i++; 202 } 203 } 204 205 private void TryParse(string importFileName, bool strict) { 91 values[rowIndex, columnIndex++] = element; 92 } 93 rowIndex++; 94 } 95 } 96 97 private void TryParse(string fileName) { 206 98 Exception lastEx = null; 207 NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo , CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo};99 
NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo }; 208 100 foreach (NumberFormatInfo numberFormat in possibleFormats) { 209 using (StreamReader reader = new StreamReader( importFileName)) {101 using (StreamReader reader = new StreamReader(fileName)) { 210 102 tokenizer = new Tokenizer(reader, numberFormat); 211 103 try { 212 104 // parse the file 213 Parse( strict);105 Parse(); 214 106 return; // parsed without errors -> return; 215 107 } … … 225 117 #region tokenizer 226 118 internal enum TokenTypeEnum { 227 At, Assign, NewLine, SemiColon, String, Double, Int119 NewLine, Separator, String, Double 228 120 } 229 121 … … 232 124 public string stringValue; 233 125 public double doubleValue; 234 public int intValue;235 126 236 127 public Token(TokenTypeEnum type, string value) { … … 238 129 stringValue = value; 239 130 doubleValue = 0.0; 240 intValue = 0;241 131 } 242 132 … … 247 137 248 138 249 class Tokenizer {139 internal class Tokenizer { 250 140 private StreamReader reader; 251 141 private List<Token> tokens; 252 142 private NumberFormatInfo numberFormatInfo; 253 143 254 public int CurrentLineNumber = 0; 255 public string CurrentLine; 256 257 public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n"); 258 public static Token AtToken = new Token(TokenTypeEnum.At, "@"); 259 public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "="); 260 public static Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";"); 261 262 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) { 144 private int currentLineNumber = 0; 145 public int CurrentLineNumber { 146 get { return currentLineNumber; } 147 private set { currentLineNumber = value; } 148 } 149 private string currentLine; 150 public string CurrentLine { 151 get { return currentLine; } 152 private set { currentLine = value; } 153 } 154 155 private Token newlineToken; 156 public Token NewlineToken { 157 get { return 
newlineToken; } 158 private set { newlineToken = value; } 159 } 160 private Token separatorToken; 161 public Token SeparatorToken { 162 get { return separatorToken; } 163 private set { separatorToken = value; } 164 } 165 166 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) { 263 167 this.reader = reader; 264 168 this.numberFormatInfo = numberFormatInfo; 169 separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString()); 170 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 265 171 tokens = new List<Token>(); 266 172 ReadNextTokens(); 173 } 174 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) 175 : this(reader, numberFormatInfo, ';') { 267 176 } 268 177 … … 284 193 StringBuilder subStr = new StringBuilder(); 285 194 foreach (char c in line) { 286 if (c == ' @' || c == '=' || c == ';') {195 if (c == ';') { 287 196 yield return subStr.ToString(); 288 197 subStr = new StringBuilder(); … … 297 206 private Token MakeToken(string strToken) { 298 207 Token token = new Token(TokenTypeEnum.String, strToken); 299 if (strToken.Equals(AtToken.stringValue)) { 300 return AtToken; 301 } else if (strToken.Equals(AssignmentToken.stringValue)) { 302 return AssignmentToken; 303 } else if (strToken.Equals(SeparatorToken.stringValue)) { 208 if (strToken.Equals(SeparatorToken.stringValue)) { 304 209 return SeparatorToken; 305 } else if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {306 token.type = TokenTypeEnum.Int;307 return token;308 210 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 309 211 token.type = TokenTypeEnum.Double; … … 335 237 336 238 #region parsing 337 private void Parse( bool strict) {338 Parse MetaData(strict);239 private void Parse() { 240 ParseVariableNames(); 339 241 if (!tokenizer.HasNext()) Error("Couldn't parse data values. 
Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 340 Parse SampleData(strict);341 if ( samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);342 } 343 344 private void Parse SampleData(bool strict) {242 ParseValues(); 243 if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 244 } 245 246 private void ParseValues() { 345 247 while (tokenizer.HasNext()) { 346 248 List<double> row = new List<double>(); 347 row.Add(NextValue(tokenizer, strict)); 348 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) { 349 Expect(Tokenizer.SeparatorToken); 350 row.Add(NextValue(tokenizer, strict)); 351 } 352 Expect(Tokenizer.NewlineToken); 353 // when parsing strictly all rows have to have the same number of values 354 if (strict) { 355 // the first row defines how many samples are needed 356 if (samplesList.Count > 0 && samplesList[0].Count != row.Count) { 357 Error("The first row of the dataset has " + samplesList[0].Count + " columns." 
+ 358 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 359 } 360 } else if (samplesList.Count > 0) { 361 // when we are not strict then fill or drop elements as needed 362 if (samplesList[0].Count > row.Count) { 363 // fill with NAN 364 for (int i = row.Count; i < samplesList[0].Count; i++) { 365 row.Add(double.NaN); 366 } 367 } else if (samplesList[0].Count < row.Count) { 368 // drop last k elements where k = n - length of first row 369 row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); 370 } 371 } 372 249 row.Add(NextValue(tokenizer)); 250 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 251 Expect(tokenizer.SeparatorToken); 252 row.Add(NextValue(tokenizer)); 253 } 254 Expect(tokenizer.NewlineToken); 255 // all rows have to have the same number of values 256 // the first row defines how many samples are needed 257 if (rowValues.Count > 0 && rowValues[0].Count != row.Count) { 258 Error("The first row of the dataset has " + rowValues[0].Count + " columns." 
+ 259 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 260 } 373 261 // add the current row to the collection of rows and start a new row 374 samplesList.Add(row);262 rowValues.Add(row); 375 263 row = new List<double>(); 376 264 } 377 265 } 378 266 379 private double NextValue(Tokenizer tokenizer , bool strict) {380 if (tokenizer.Peek() == Tokenizer.SeparatorToken || tokenizer.Peek() == Tokenizer.NewlineToken) return double.NaN;267 private double NextValue(Tokenizer tokenizer) { 268 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN; 381 269 Token current = tokenizer.Next(); 382 if (current.type == TokenTypeEnum.Se miColon|| current.type == TokenTypeEnum.String) {270 if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) { 383 271 return double.NaN; 384 272 } else if (current.type == TokenTypeEnum.Double) { 385 273 // just take the value 386 274 return current.doubleValue; 387 } else if (current.type == TokenTypeEnum.Int) { 388 // translate the int value to double 389 return (double)current.intValue; 390 } else { 391 // found an unexpected token => throw error when parsing strictly 392 // when we are parsing non-strictly we also allow unreadable values inserting NAN instead 393 if (strict) { 394 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 395 } else { 396 return double.NaN; 397 } 398 } 399 return double.NaN; 400 } 401 402 private void ParseMetaData(bool strict) { 403 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) { 404 Expect(Tokenizer.AtToken); 405 406 Token nameToken = tokenizer.Next(); 407 Expect(Tokenizer.AssignmentToken); 275 } 276 // found an unexpected token => throw error 277 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); 278 // this line is never executed because Error() throws an exception 279 throw new 
InvalidOperationException(); 280 } 281 282 private void ParseVariableNames() { 283 // if the first line doesn't start with a double value then we assume that the 284 // first line contains variable names 285 if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) { 408 286 409 287 List<Token> tokens = new List<Token>(); … … 411 289 valueToken = tokenizer.Next(); 412 290 tokens.Add(valueToken); 413 while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {414 Expect( Tokenizer.SeparatorToken);291 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 292 Expect(tokenizer.SeparatorToken); 415 293 valueToken = tokenizer.Next(); 416 if (valueToken != Tokenizer.NewlineToken) {294 if (valueToken != tokenizer.NewlineToken) { 417 295 tokens.Add(valueToken); 418 296 } 419 297 } 420 if (valueToken != Tokenizer.NewlineToken) {421 Expect( Tokenizer.NewlineToken);422 } 423 metadata[nameToken.stringValue] = tokens;298 if (valueToken != tokenizer.NewlineToken) { 299 Expect(tokenizer.NewlineToken); 300 } 301 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 424 302 } 425 303 } -
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/DataFormatException.cs
r3262 r3264 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-20 08Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 25 25 using System.Text; 26 26 27 namespace HeuristicLab. DataAnalysis{28 public class DataFormatException : Exception {27 namespace HeuristicLab.Problems.DataAnalysis.Regression { 28 public class DataFormatException : Exception { 29 29 private int line; 30 30 public int Line { … … 35 35 get { return token; } 36 36 } 37 public DataFormatException(string message, string token, int line) : base(message+"\nToken: " + token + " (line: " + line + ")"){ 37 public DataFormatException(string message, string token, int line) 38 : base(message + "\nToken: " + token + " (line: " + line + ")") { 38 39 this.token = token; 39 40 this.line = line; -
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/HeuristicLab.Problems.DataAnalysis.Regression-3.3.csproj
r3257 r3264 85 85 <None Include="HeuristicLabProblemsDataAnalysisRegressionPlugin.cs.frame" /> 86 86 <None Include="Properties\AssemblyInfo.frame" /> 87 <Compile Include="CsvFileParser.cs" /> 88 <Compile Include="DataFormatException.cs" /> 87 89 <Compile Include="Symbolic\ArithmeticExpressionGrammar.cs" /> 88 90 <Compile Include="Symbolic\SimpleArithmeticExpressionEvaluator.cs" /> … … 95 97 <Compile Include="Properties\AssemblyInfo.cs" /> 96 98 <Compile Include="Symbolic\SymbolicRegressionProblemView.cs"> 97 <SubType>UserControl</SubType>98 99 </Compile> 99 100 <Compile Include="Symbolic\SymbolicRegressionProblemView.Designer.cs"> … … 104 105 <Compile Include="Symbolic\Symbols\Variable.cs" /> 105 106 <Compile Include="Symbolic\Symbols\VariableTreeNode.cs" /> 107 <Compile Include="RegressionProblemView.cs"> 108 <SubType>UserControl</SubType> 109 </Compile> 110 <Compile Include="RegressionProblemView.Designer.cs"> 111 <DependentUpon>RegressionProblemView.cs</DependentUpon> 112 </Compile> 106 113 </ItemGroup> 107 114 <ItemGroup> -
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/RegressionProblem.cs
r3253 r3264 30 30 using HeuristicLab.Problems.DataAnalysis; 31 31 using System.Drawing; 32 using System.IO; 32 33 33 34 namespace HeuristicLab.Problems.DataAnalysis.Regression { … … 69 70 } 70 71 #endregion 72 #region properties 73 public Dataset Dataset { 74 get { return DatasetParameter.Value; } 75 set { DatasetParameter.Value = value; } 76 } 77 public StringValue TargetVariable { 78 get { return TargetVariableParameter.Value; } 79 set { TargetVariableParameter.Value = value; } 80 } 81 public ItemList<StringValue> InputVariables { 82 get { return InputVariablesParameter.Value; } 83 set { InputVariablesParameter.Value = value; } 84 } 85 public IntValue TrainingSamplesStart { 86 get { return TrainingSamplesStartParameter.Value; } 87 set { TrainingSamplesStartParameter.Value = value; } 88 } 89 public IntValue TrainingSamplesEnd { 90 get { return TrainingSamplesEndParameter.Value; } 91 set { TrainingSamplesEndParameter.Value = value; } 92 } 93 public IntValue ValidationSamplesStart { 94 get { return ValidationSamplesStartParameter.Value; } 95 set { ValidationSamplesStartParameter.Value = value; } 96 } 97 public IntValue ValidationSamplesEnd { 98 get { return ValidationSamplesEndParameter.Value; } 99 set { ValidationSamplesEndParameter.Value = value; } 100 } 101 public IntValue TestSamplesStart { 102 get { return TestSamplesStartParameter.Value; } 103 set { TestSamplesStartParameter.Value = value; } 104 } 105 public IntValue TestSamplesEnd { 106 get { return TestSamplesEndParameter.Value; } 107 set { TestSamplesEndParameter.Value = value; } 108 } 109 #endregion 71 110 72 111 public RegressionProblem() … … 88 127 private RegressionProblem(bool deserializing) : base() { } 89 128 90 #region ISingleObjectiveProblem Members 91 92 public IParameter MaximizationParameter { 93 get { throw new NotImplementedException(); } 129 public virtual void ImportFromFile(string fileName) { 130 var csvFileParser = new CsvFileParser(); 131 csvFileParser.Parse(fileName); 132 Name = 
"Regression Problem (imported from " + Path.GetFileName(fileName); 133 Dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values); 134 Dataset.Name = Path.GetFileName(fileName); 135 TargetVariable = new StringValue(Dataset.VariableNames.First()); 136 InputVariables = new ItemList<StringValue>(Dataset.VariableNames.Skip(1).Select(s => new StringValue(s))); 137 TrainingSamplesStart = new IntValue(0); 138 TrainingSamplesEnd = new IntValue(csvFileParser.Rows); 139 TestSamplesStart = new IntValue(0); 140 TestSamplesEnd = new IntValue(csvFileParser.Rows); 94 141 } 95 96 public IParameter BestKnownQualityParameter {97 get { throw new NotImplementedException(); }98 }99 100 public ISingleObjectiveEvaluator Evaluator {101 get { throw new NotImplementedException(); }102 }103 104 #endregion105 142 } 106 143 } -
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/SymbolicRegressionProblemView.cs
r3253 r3264 12 12 [View("SymbolicRegressionProblem View")] 13 13 [Content(typeof(SymbolicRegressionProblem), true)] 14 public partial class SymbolicRegressionProblemView : ProblemView {14 public partial class SymbolicRegressionProblemView : RegressionProblemView { 15 15 public new SymbolicRegressionProblem Content { 16 16 get { return (SymbolicRegressionProblem)base.Content; }
Note: See TracChangeset for help on using the changeset viewer.