
Timestamp: 08/20/10 17:42:27 (14 years ago)
Author: gkronber
Message: Worked on overfitting analyzer and CPP. #1142
Location: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers
Files: 2 edited

Legend: lines prefixed with "+" were added in r4272, lines prefixed with "-" were removed from r4271, unprefixed lines are unchanged context, and "…" marks omitted lines.

  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs

    r4271 → r4272

       get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters[BestSolutionParameterName]; }
     }
+    public ILookupParameter<SymbolicRegressionSolution> BestTrainingSolutionParameter {
+      get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters["BestTrainingSolution"]; }
+    }
+    public ScopeTreeLookupParameter<DoubleValue> QualityParameter {
+      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; }
+    }
+
     public ILookupParameter<IntValue> GenerationsParameter {
       get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
…
       Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
       Parameters.Add(new LookupParameter<SymbolicRegressionSolution>(BestSolutionParameterName, "The best symbolic regression solution."));
+      Parameters.Add(new LookupParameter<SymbolicRegressionSolution>("BestTrainingSolution"));
+      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
       Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
       Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionQualityParameterName, "The quality of the best symbolic regression solution."));
…
         Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName));
       }
+      if (!Parameters.ContainsKey("BestTrainingSolution")) {
+        Parameters.Add(new LookupParameter<SymbolicRegressionSolution>("BestTrainingSolution"));
+      }
+      if (!Parameters.ContainsKey("Quality")) {
+        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
+      }
       #endregion
     }

     public override IOperation Apply() {
-      var trees = SymbolicExpressionTree;
+      ItemArray<SymbolicExpressionTree> trees = SymbolicExpressionTree;
+      ItemArray<DoubleValue> qualities = QualityParameter.ActualValue;

       string targetVariable = ProblemData.TargetVariable.Value;
…
       double bestQuality = Maximization.Value ? double.NegativeInfinity : double.PositiveInfinity;
       SymbolicExpressionTree bestTree = null;
-
-      foreach (var tree in trees) {
+      SymbolicExpressionTree bestTrainingTree = trees[0];
+      double bestTrainingQuality = qualities[0].Value;
+      for (int i = 0; i < trees.Length; i++) {
+        SymbolicExpressionTree tree = trees[i];
         double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree,
           lowerEstimationLimit, upperEstimationLimit,
…
           bestTree = tree;
         }
-      }
+        if ((Maximization.Value && qualities[i].Value > bestTrainingQuality) ||
+            (!Maximization.Value && qualities[i].Value < bestTrainingQuality)) {
+          bestTrainingQuality = qualities[i].Value;
+          bestTrainingTree = tree;
+        }
+      }
+
+      var scaledBestTrainingTree = GetScaledTree(bestTrainingTree);
+
+      SymbolicRegressionSolution bestTrainingSolution = new SymbolicRegressionSolution(ProblemData,
+        new SymbolicRegressionModel(SymbolicExpressionTreeInterpreter, scaledBestTrainingTree),
+        lowerEstimationLimit, upperEstimationLimit);
+      bestTrainingSolution.Name = "Best solution (training)";
+      bestTrainingSolution.Description = "The solution of the population with the highest fitness";

       // if the best validation tree is better than the current best solution => update
…
         (!Maximization.Value && bestQuality < BestSolutionQuality.Value);
       if (newBest) {
-        // calculate scaling parameters and only for the best tree using the full training set
-        double alpha, beta;
-        int trainingStart = ProblemData.TrainingSamplesStart.Value;
-        int trainingEnd = ProblemData.TrainingSamplesEnd.Value;
-        IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart);
-        IEnumerable<double> originalValues = ProblemData.Dataset.GetEnumeratedVariableValues(targetVariable, trainingRows);
-        IEnumerable<double> estimatedValues = SymbolicExpressionTreeInterpreter.GetSymbolicExpressionTreeValues(bestTree, ProblemData.Dataset, trainingRows);
-
-        SymbolicRegressionScaledMeanSquaredErrorEvaluator.CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha);
-
-        // scale tree for solution
-        var scaledTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta);
+        var scaledTree = GetScaledTree(bestTree);
         var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(),
           scaledTree);
…
         Results.Add(new Result(BestSolutionQualityParameterName, new DoubleValue()));
         Results.Add(new Result(CurrentBestValidationQualityParameterName, new DoubleValue()));
+        Results.Add(new Result("Best solution (training)", bestTrainingSolution));
       }
       Results[BestSolutionQualityParameterName].Value = new DoubleValue(BestSolutionQualityParameter.ActualValue.Value);
       Results[CurrentBestValidationQualityParameterName].Value = new DoubleValue(bestQuality);
+      Results["Best solution (training)"].Value = bestTrainingSolution;

       DataTable validationValues = (DataTable)Results[BestSolutionQualityValuesParameterName].Value;
…

       BestSolutionQualityValuesParameter.ActualValue = validationValues;
-
+
       return base.Apply();
+    }
+
+    private SymbolicExpressionTree GetScaledTree(SymbolicExpressionTree tree) {
+      // calculate scaling parameters and only for the best tree using the full training set
+      double alpha, beta;
+      int trainingStart = ProblemData.TrainingSamplesStart.Value;
+      int trainingEnd = ProblemData.TrainingSamplesEnd.Value;
+      IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart);
+      IEnumerable<double> originalValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable.Value, trainingRows);
+      IEnumerable<double> estimatedValues = SymbolicExpressionTreeInterpreter.GetSymbolicExpressionTreeValues(tree, ProblemData.Dataset, trainingRows);
+
+      SymbolicRegressionScaledMeanSquaredErrorEvaluator.CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha);
+
+      // scale tree for solution
+      return SymbolicRegressionSolutionLinearScaler.Scale(tree, alpha, beta);
     }

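For context on the extracted GetScaledTree helper: it fits the tree output e to the target t as t ≈ alpha + beta * e over the training rows and then wraps the tree via SymbolicRegressionSolutionLinearScaler.Scale. The snippet below is a minimal, self-contained sketch of the usual least-squares derivation of such scaling parameters; the class name LinearScalingSketch is hypothetical, and the actual CalculateScalingParameters implementation in HeuristicLab may differ in detail.

using System.Collections.Generic;
using System.Linq;

// Hypothetical helper; it only illustrates the least-squares idea behind a call like
// CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha).
static class LinearScalingSketch {
  public static void CalculateScalingParameters(IEnumerable<double> original,
                                                IEnumerable<double> estimated,
                                                out double beta, out double alpha) {
    double[] t = original.ToArray();   // target values
    double[] e = estimated.ToArray();  // raw tree outputs
    double meanT = t.Average();
    double meanE = e.Average();
    double cov = 0.0, varE = 0.0;
    for (int i = 0; i < t.Length; i++) {
      cov += (e[i] - meanE) * (t[i] - meanT);
      varE += (e[i] - meanE) * (e[i] - meanE);
    }
    beta = varE > 0.0 ? cov / varE : 1.0;  // guard against constant tree outputs
    alpha = meanT - beta * meanE;          // shift so the scaled mean matches the target mean
  }
}

With these parameters, alpha + beta * e minimizes the squared error against the target on the training rows, which is why only the tree that is about to be reported as a solution needs to be scaled.
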
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/OverfittingAnalyzer.cs

    r4271 → r4272

 using System.Linq;
 using HeuristicLab.Analysis;
+using HeuristicLab.Common;
 using HeuristicLab.Core;
 using HeuristicLab.Data;
…
       get { return (ILookupParameter<PercentValue>)Parameters["RelativeValidationQuality"]; }
     }
+    //public IValueLookupParameter<PercentValue> RelativeValidationQualityLowerLimitParameter {
+    //  get { return (IValueLookupParameter<PercentValue>)Parameters["RelativeValidationQualityLowerLimit"]; }
+    //}
+    //public IValueLookupParameter<PercentValue> RelativeValidationQualityUpperLimitParameter {
+    //  get { return (IValueLookupParameter<PercentValue>)Parameters["RelativeValidationQualityUpperLimit"]; }
+    //}
     public ILookupParameter<DoubleValue> TrainingValidationQualityCorrelationParameter {
       get { return (ILookupParameter<DoubleValue>)Parameters["TrainingValidationCorrelation"]; }
…
     public ILookupParameter<ResultCollection> ResultsParameter {
       get { return (ILookupParameter<ResultCollection>)Parameters["Results"]; }
+    }
+    public ILookupParameter<DoubleValue> InitialTrainingQualityParameter {
+      get { return (ILookupParameter<DoubleValue>)Parameters["InitialTrainingQuality"]; }
     }
     #endregion
…
       Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
       Parameters.Add(new LookupParameter<PercentValue>("RelativeValidationQuality"));
+      //Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityUpperLimit", new PercentValue(0.05)));
+      //Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityLowerLimit", new PercentValue(-0.05)));
       Parameters.Add(new LookupParameter<DoubleValue>("TrainingValidationCorrelation"));
       Parameters.Add(new ValueLookupParameter<DoubleValue>("CorrelationLimit", new DoubleValue(0.65)));
       Parameters.Add(new LookupParameter<BoolValue>("Overfitting"));
       Parameters.Add(new LookupParameter<ResultCollection>("Results"));
+      Parameters.Add(new LookupParameter<DoubleValue>("InitialTrainingQuality"));
     }

…
     [StorableHook(HookType.AfterDeserialization)]
     private void AfterDeserialization() {
+      if (!Parameters.ContainsKey("InitialTrainingQuality")) {
+        Parameters.Add(new LookupParameter<DoubleValue>("InitialTrainingQuality"));
+      }
+      //if (!Parameters.ContainsKey("RelativeValidationQualityUpperLimit")) {
+      //  Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityUpperLimit", new PercentValue(0.05)));
+      //}
+      //if (!Parameters.ContainsKey("RelativeValidationQualityLowerLimit")) {
+      //  Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeValidationQualityLowerLimit", new PercentValue(-0.05)));
+      //}
     }

…
       //if (RelativeValidationQualityParameter.ActualValue == null) {
       // first call initialize the relative quality using the difference between average training and validation quality
-      double avgTrainingQuality = qualities.Average(x => x.Value);
-      double avgValidationQuality = validationQualities.Average();
+      double avgTrainingQuality = qualities.Select(x => x.Value).Median();
+      double avgValidationQuality = validationQualities.Median();

       if (Maximization.Value)
…
       //}

-      double[] validationArr = validationQualities.ToArray();
-      double[] trainingArr = qualities.Select(x => x.Value).ToArray();
-      double r = alglib.correlation.spearmanrankcorrelation(trainingArr, validationArr, trainingArr.Length);
+      // cut away 0.0 values to make the correlation stronger
+      // necessary because R² values of 0.0 are strong outliers
+      //int percentile = (int)Math.Round(0.1 * validationQualities.Count);
+      //double validationCutOffValue = validationQualities.OrderBy(x => x).ElementAt(percentile);
+      //double trainingCutOffValue = qualities.Select(x => x.Value).OrderBy(x => x).ElementAt(percentile);
+      double validationCutOffValue = 0.05;
+      double trainingCutOffValue = validationCutOffValue;
+
+      double[] validationArr = new double[validationQualities.Count];
+      double[] trainingArr = new double[validationQualities.Count];
+      int arrIndex = 0;
+      for (int i = 0; i < validationQualities.Count; i++) {
+        if (validationQualities[i] > validationCutOffValue &&
+            qualities[i].Value > trainingCutOffValue) {
+          validationArr[arrIndex] = validationQualities[i];
+          trainingArr[arrIndex] = qualities[i].Value;
+          arrIndex++;
+        }
+      }
+      double r = alglib.correlation.spearmanrankcorrelation(trainingArr, validationArr, arrIndex);
       TrainingValidationQualityCorrelationParameter.ActualValue = new DoubleValue(r);
-      OverfittingParameter.ActualValue = new BoolValue(RelativeValidationQualityParameter.ActualValue.Value < 0 && r < CorrelationLimitParameter.ActualValue.Value);
+      if (InitialTrainingQualityParameter.ActualValue == null)
+        InitialTrainingQualityParameter.ActualValue = new DoubleValue(avgValidationQuality);
+      bool overfitting =
+        avgTrainingQuality > InitialTrainingQualityParameter.ActualValue.Value &&  // better on training than in initial generation
+        r < CorrelationLimitParameter.ActualValue.Value;  // low correlation between training and validation quality
+
+      //// if validation quality is within a certain margin of percentage deviation (default -5% .. 5%) then there is no overfitting
+      //// correlation is also bad when underfitting but validation quality cannot be a lot larger than training quality if overfitting
+      //(RelativeValidationQualityParameter.ActualValue.Value > RelativeValidationQualityUpperLimitParameter.ActualValue.Value || // better on training than on validation
+      // RelativeValidationQualityParameter.ActualValue.Value < RelativeValidationQualityLowerLimitParameter.ActualValue.Value); // better on training than on validation
+
+      OverfittingParameter.ActualValue = new BoolValue(overfitting);
       return base.Apply();
     }
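Taken together, the new OverfittingAnalyzer logic is: take the median training and validation qualities, discard individuals whose R² is at or near zero (they distort the rank statistics), compute the Spearman rank correlation between training and validation quality over the remaining individuals, and report overfitting once training quality exceeds the value recorded for the first generation while that correlation falls below CorrelationLimit (default 0.65). The sketch below restates this heuristic as a standalone function; the names OverfittingHeuristicSketch and IsOverfitting are hypothetical, the Spearman computation is a plain reimplementation rather than the ALGLIB routine used in the changeset, and the baseline quality is passed in explicitly instead of via InitialTrainingQualityParameter.

using System;
using System.Linq;

// Hypothetical, standalone restatement of the overfitting heuristic; not the HeuristicLab code.
static class OverfittingHeuristicSketch {
  public static bool IsOverfitting(double[] trainingQualities, double[] validationQualities,
                                   double initialTrainingQuality,
                                   double cutOff = 0.05, double correlationLimit = 0.65) {
    // keep only pairs where both qualities exceed the cut-off; R² values of ~0 are strong outliers
    var pairs = trainingQualities.Zip(validationQualities, (t, v) => (train: t, valid: v))
                                 .Where(p => p.train > cutOff && p.valid > cutOff)
                                 .ToArray();
    if (pairs.Length < 2) return false;  // not enough points for a meaningful correlation
    double[] train = pairs.Select(p => p.train).ToArray();
    double[] valid = pairs.Select(p => p.valid).ToArray();
    double r = SpearmanRankCorrelation(train, valid);
    // overfitting: training quality improved over the first generation,
    // but training and validation quality no longer move together
    return Median(trainingQualities) > initialTrainingQuality && r < correlationLimit;
  }

  static double SpearmanRankCorrelation(double[] x, double[] y) {
    double[] rx = Ranks(x), ry = Ranks(y);
    double mx = rx.Average(), my = ry.Average();
    double cov = 0, vx = 0, vy = 0;
    for (int i = 0; i < rx.Length; i++) {
      cov += (rx[i] - mx) * (ry[i] - my);
      vx += (rx[i] - mx) * (rx[i] - mx);
      vy += (ry[i] - my) * (ry[i] - my);
    }
    return (vx == 0 || vy == 0) ? 0.0 : cov / Math.Sqrt(vx * vy);
  }

  // 1-based ranks with ties resolved by averaging, as Spearman's rho requires
  static double[] Ranks(double[] values) {
    int n = values.Length;
    int[] order = Enumerable.Range(0, n).OrderBy(i => values[i]).ToArray();
    var ranks = new double[n];
    for (int k = 0; k < n; ) {
      int j = k;
      while (j + 1 < n && values[order[j + 1]] == values[order[k]]) j++;
      double avgRank = (k + j) / 2.0 + 1;
      for (int m = k; m <= j; m++) ranks[order[m]] = avgRank;
      k = j + 1;
    }
    return ranks;
  }

  static double Median(double[] values) {
    double[] sorted = values.OrderBy(v => v).ToArray();
    int n = sorted.Length;
    return n % 2 == 1 ? sorted[n / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2.0;
  }
}

For example, IsOverfitting(trainingQualities, validationQualities, initialTrainingQuality) could be evaluated once per generation; with the default arguments it mirrors the cut-off of 0.05 and the correlation limit of 0.65 set in the changeset.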