- Timestamp:
- 08/20/10 17:42:27 (14 years ago)
- Location:
- branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs
r4271 r4272 140 140 get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters[BestSolutionParameterName]; } 141 141 } 142 public ILookupParameter<SymbolicRegressionSolution> BestTrainingSolutionParameter { 143 get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters["BestTrainingSolution"]; } 144 } 145 public ScopeTreeLookupParameter<DoubleValue> QualityParameter { 146 get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; } 147 } 148 142 149 public ILookupParameter<IntValue> GenerationsParameter { 143 150 get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; } … … 228 235 Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees.")); 229 236 Parameters.Add(new LookupParameter<SymbolicRegressionSolution>(BestSolutionParameterName, "The best symbolic regression solution.")); 237 Parameters.Add(new LookupParameter<SymbolicRegressionSolution>("BestTrainingSolution")); 238 Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality")); 230 239 Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far.")); 231 240 Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionQualityParameterName, "The quality of the best symbolic regression solution.")); … … 252 261 Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName)); 253 262 } 263 if (!Parameters.ContainsKey("BestTrainingSolution")) { 264 Parameters.Add(new LookupParameter<SymbolicRegressionSolution>("BestTrainingSolution")); 265 } 266 if (!Parameters.ContainsKey("Quality")) { 267 Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality")); 268 } 254 269 #endregion 255 270 } 256 271 257 272 public override IOperation Apply() { 258 var trees = SymbolicExpressionTree; 273 ItemArray<SymbolicExpressionTree> trees = SymbolicExpressionTree; 274 ItemArray<DoubleValue> qualities = QualityParameter.ActualValue; 259 275 260 276 string targetVariable = ProblemData.TargetVariable.Value; … … 273 289 double bestQuality = Maximization.Value ? double.NegativeInfinity : double.PositiveInfinity; 274 290 SymbolicExpressionTree bestTree = null; 275 276 foreach (var tree in trees) { 291 SymbolicExpressionTree bestTrainingTree = trees[0]; 292 double bestTrainingQuality = qualities[0].Value; 293 for (int i = 0; i < trees.Length; i++) { 294 SymbolicExpressionTree tree = trees[i]; 277 295 double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree, 278 296 lowerEstimationLimit, upperEstimationLimit, … … 285 303 bestTree = tree; 286 304 } 287 } 305 if ((Maximization.Value && qualities[i].Value > bestTrainingQuality) || 306 (!Maximization.Value && qualities[i].Value < bestTrainingQuality)) { 307 bestTrainingQuality = qualities[i].Value; 308 bestTrainingTree = tree; 309 } 310 } 311 312 var scaledBestTrainingTree = GetScaledTree(bestTrainingTree); 313 314 SymbolicRegressionSolution bestTrainingSolution = new SymbolicRegressionSolution(ProblemData, 315 new SymbolicRegressionModel(SymbolicExpressionTreeInterpreter, scaledBestTrainingTree), 316 lowerEstimationLimit, upperEstimationLimit); 317 bestTrainingSolution.Name = "Best solution (training)"; 318 bestTrainingSolution.Description = "The solution of the population with the highest fitness"; 288 319 289 320 // if the best validation tree is better than the current best solution => update … … 293 324 (!Maximization.Value && bestQuality < BestSolutionQuality.Value); 294 325 if (newBest) { 295 // calculate scaling parameters and only for the best tree using the full training set 296 double alpha, beta; 297 int trainingStart = ProblemData.TrainingSamplesStart.Value; 298 int trainingEnd = ProblemData.TrainingSamplesEnd.Value; 299 IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart); 300 IEnumerable<double> originalValues = ProblemData.Dataset.GetEnumeratedVariableValues(targetVariable, trainingRows); 301 IEnumerable<double> estimatedValues = SymbolicExpressionTreeInterpreter.GetSymbolicExpressionTreeValues(bestTree, ProblemData.Dataset, trainingRows); 302 303 SymbolicRegressionScaledMeanSquaredErrorEvaluator.CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha); 304 305 // scale tree for solution 306 var scaledTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta); 326 var scaledTree = GetScaledTree(bestTree); 307 327 var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(), 308 328 scaledTree); … … 323 343 Results.Add(new Result(BestSolutionQualityParameterName, new DoubleValue())); 324 344 Results.Add(new Result(CurrentBestValidationQualityParameterName, new DoubleValue())); 345 Results.Add(new Result("Best solution (training)", bestTrainingSolution)); 325 346 } 326 347 Results[BestSolutionQualityParameterName].Value = new DoubleValue(BestSolutionQualityParameter.ActualValue.Value); 327 348 Results[CurrentBestValidationQualityParameterName].Value = new DoubleValue(bestQuality); 349 Results["Best solution (training)"].Value = bestTrainingSolution; 328 350 329 351 DataTable validationValues = (DataTable)Results[BestSolutionQualityValuesParameterName].Value; … … 332 354 333 355 BestSolutionQualityValuesParameter.ActualValue = validationValues; 334 356 335 357 return base.Apply(); 358 } 359 360 private SymbolicExpressionTree GetScaledTree(SymbolicExpressionTree tree) { 361 // calculate scaling parameters and only for the best tree using the full training set 362 double alpha, beta; 363 int trainingStart = ProblemData.TrainingSamplesStart.Value; 364 int trainingEnd = ProblemData.TrainingSamplesEnd.Value; 365 IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart); 366 IEnumerable<double> originalValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable.Value, trainingRows); 367 IEnumerable<double> estimatedValues = SymbolicExpressionTreeInterpreter.GetSymbolicExpressionTreeValues(tree, ProblemData.Dataset, trainingRows); 368 369 SymbolicRegressionScaledMeanSquaredErrorEvaluator.CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha); 370 371 // scale tree for solution 372 return SymbolicRegressionSolutionLinearScaler.Scale(tree, alpha, beta); 336 373 } 337 374 -
branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/OverfittingAnalyzer.cs
r4271 r4272 23 23 using System.Linq; 24 24 using HeuristicLab.Analysis; 25 using HeuristicLab.Common; 25 26 using HeuristicLab.Core; 26 27 using HeuristicLab.Data; … … 91 92 get { return (ILookupParameter<PercentValue>)Parameters["RelativeValidationQuality"]; } 92 93 } 94 //public IValueLookupParameter<PercentValue> RelativeValidationQualityLowerLimitParameter { 95 // get { return (IValueLookupParameter<PercentValue>)Parameters["RelativeValidationQualityLowerLimit"]; } 96 //} 97 //public IValueLookupParameter<PercentValue> RelativeValidationQualityUpperLimitParameter { 98 // get { return (IValueLookupParameter<PercentValue>)Parameters["RelativeValidationQualityUpperLimit"]; } 99 //} 93 100 public ILookupParameter<DoubleValue> TrainingValidationQualityCorrelationParameter { 94 101 get { return (ILookupParameter<DoubleValue>)Parameters["TrainingValidationCorrelation"]; } … … 102 109 public ILookupParameter<ResultCollection> ResultsParameter { 103 110 get { return (ILookupParameter<ResultCollection>)Parameters["Results"]; } 111 } 112 public ILookupParameter<DoubleValue> InitialTrainingQualityParameter { 113 get { return (ILookupParameter<DoubleValue>)Parameters["InitialTrainingQuality"]; } 104 114 } 105 115 #endregion … … 156 166 Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees.")); 157 167 Parameters.Add(new LookupParameter<PercentValue>("RelativeValidationQuality")); 168 //Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityUpperLimit", new PercentValue(0.05))); 169 //Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityLowerLimit", new PercentValue(-0.05))); 158 170 Parameters.Add(new LookupParameter<DoubleValue>("TrainingValidationCorrelation")); 159 171 Parameters.Add(new ValueLookupParameter<DoubleValue>("CorrelationLimit", new DoubleValue(0.65))); 160 172 Parameters.Add(new LookupParameter<BoolValue>("Overfitting")); 161 173 Parameters.Add(new LookupParameter<ResultCollection>("Results")); 174 Parameters.Add(new LookupParameter<DoubleValue>("InitialTrainingQuality")); 162 175 } 163 176 … … 167 180 [StorableHook(HookType.AfterDeserialization)] 168 181 private void AfterDeserialization() { 182 if (!Parameters.ContainsKey("InitialTrainingQuality")) { 183 Parameters.Add(new LookupParameter<DoubleValue>("InitialTrainingQuality")); 184 } 185 //if (!Parameters.ContainsKey("RelativeValidationQualityUpperLimit")) { 186 // Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityUpperLimit", new PercentValue(0.05))); 187 //} 188 //if (!Parameters.ContainsKey("RelativeValidationQualityLowerLimit")) { 189 // Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityLowerLimit", new PercentValue(-0.05))); 190 //} 169 191 } 170 192 … … 205 227 //if (RelativeValidationQualityParameter.ActualValue == null) { 206 228 // first call initialize the relative quality using the difference between average training and validation quality 207 double avgTrainingQuality = qualities. Average(x => x.Value);208 double avgValidationQuality = validationQualities. Average();229 double avgTrainingQuality = qualities.Select(x => x.Value).Median(); 230 double avgValidationQuality = validationQualities.Median(); 209 231 210 232 if (Maximization.Value) … … 215 237 //} 216 238 217 double[] validationArr = validationQualities.ToArray(); 218 double[] trainingArr = qualities.Select(x => x.Value).ToArray(); 219 double r = alglib.correlation.spearmanrankcorrelation(trainingArr, validationArr, trainingArr.Length); 239 // cut away 0.0 values to make the correlation stronger 240 // necessary because R² values of 0.0 are strong outliers 241 //int percentile = (int)Math.Round(0.1 * validationQualities.Count); 242 //double validationCutOffValue = validationQualities.OrderBy(x => x).ElementAt(percentile); 243 //double trainingCutOffValue = qualities.Select(x => x.Value).OrderBy(x => x).ElementAt(percentile); 244 double validationCutOffValue = 0.05; 245 double trainingCutOffValue = validationCutOffValue; 246 247 double[] validationArr = new double[validationQualities.Count]; 248 double[] trainingArr = new double[validationQualities.Count]; 249 int arrIndex = 0; 250 for (int i = 0; i < validationQualities.Count; i++) { 251 if (validationQualities[i] > validationCutOffValue && 252 qualities[i].Value > trainingCutOffValue) { 253 validationArr[arrIndex] = validationQualities[i]; 254 trainingArr[arrIndex] = qualities[i].Value; 255 arrIndex++; 256 } 257 } 258 double r = alglib.correlation.spearmanrankcorrelation(trainingArr, validationArr, arrIndex); 220 259 TrainingValidationQualityCorrelationParameter.ActualValue = new DoubleValue(r); 221 OverfittingParameter.ActualValue = new BoolValue(RelativeValidationQualityParameter.ActualValue.Value < 0 && r < CorrelationLimitParameter.ActualValue.Value); 260 if (InitialTrainingQualityParameter.ActualValue == null) 261 InitialTrainingQualityParameter.ActualValue = new DoubleValue(avgValidationQuality); 262 bool overfitting = 263 avgTrainingQuality > InitialTrainingQualityParameter.ActualValue.Value && // better on training than in initial generation 264 r < CorrelationLimitParameter.ActualValue.Value; // low correlation between training and validation quality 265 266 //// if validation quality is within a certain margin of percentage deviation (default -5% .. 5%) then there is no overfitting 267 //// correlation is also bad when underfitting but validation quality cannot be a lot larger than training quality if overfitting 268 //(RelativeValidationQualityParameter.ActualValue.Value > RelativeValidationQualityUpperLimitParameter.ActualValue.Value || // better on training than on validation 269 // RelativeValidationQualityParameter.ActualValue.Value < RelativeValidationQualityLowerLimitParameter.ActualValue.Value); // better on training than on validation 270 271 OverfittingParameter.ActualValue = new BoolValue(overfitting); 222 272 return base.Apply(); 223 273 }
Note: See TracChangeset
for help on using the changeset viewer.