Changeset 4255


Ignore:
Timestamp:
08/18/10 19:46:02 (12 years ago)
Author:
gkronber
Message:

Added complexity reduction scheme based on validation performance for CPP. #1142

Location:
branches/DataAnalysis
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs

    r4244 r4255  
    146146      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionQualityParameterName]; }
    147147    }
     148    public ILookupParameter<DataTable> BestSolutionQualityValuesParameter {
     149      get { return (ILookupParameter<DataTable>)Parameters[BestSolutionQualityValuesParameterName]; }
     150    }
    148151    public ILookupParameter<ResultCollection> ResultsParameter {
    149152      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
     
    230233      Parameters.Add(new LookupParameter<DoubleValue>(BestKnownQualityParameterName, "The best known (validation) quality achieved on the data set."));
    231234      Parameters.Add(new LookupParameter<DoubleValue>(CurrentBestValidationQualityParameterName, "The quality of the best solution (on the validation set) of the current generation."));
     235      Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName));
    232236      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
    233237    }
     
    244248      if (!Parameters.ContainsKey(MaximizationParameterName)) {
    245249        Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
     250      }
     251      if (!Parameters.ContainsKey(BestSolutionQualityValuesParameterName)) {
     252        Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName));
    246253      }
    247254      #endregion
     
    323330      AddValue(validationValues, BestSolutionQualityParameter.ActualValue.Value, BestSolutionQualityParameterName, BestSolutionQualityParameterName);
    324331      AddValue(validationValues, bestQuality, CurrentBestValidationQualityParameterName, CurrentBestValidationQualityParameterName);
     332
     333      BestSolutionQualityValuesParameter.ActualValue = validationValues;
     334     
    325335      return base.Apply();
    326336    }
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/HeuristicLab.Problems.DataAnalysis-3.3.csproj

    r4233 r4255  
    182182  </ItemGroup>
    183183  <ItemGroup>
     184    <ProjectReference Include="..\..\HeuristicLab.Analysis\3.3\HeuristicLab.Analysis-3.3.csproj">
     185      <Project>{887425B4-4348-49ED-A457-B7D2C26DDBF9}</Project>
     186      <Name>HeuristicLab.Analysis-3.3</Name>
     187    </ProjectReference>
    184188    <ProjectReference Include="..\..\HeuristicLab.Collections\3.3\HeuristicLab.Collections-3.3.csproj">
    185189      <Project>{958B43BC-CC5C-4FA2-8628-2B3B01D890B6}</Project>
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/Operators/CovariantParsimonyPressure.cs

    r4233 r4255  
    3232using System.Collections.Generic;
    3333using HeuristicLab.Problems.DataAnalysis.Evaluators;
     34using HeuristicLab.Analysis;
    3435
    3536namespace HeuristicLab.Problems.DataAnalysis.Operators {
     
    4344      get { return (IScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; }
    4445    }
     46    public IScopeTreeLookupParameter<DoubleValue> AdjustedQualityParameter {
     47      get { return (IScopeTreeLookupParameter<DoubleValue>)Parameters["AdjustedQuality"]; }
     48    }
     49
    4550    public ILookupParameter<BoolValue> MaximizationParameter {
    4651      get { return (ILookupParameter<BoolValue>)Parameters["Maximization"]; }
     
    4954      get { return (IValueLookupParameter<DoubleValue>)Parameters["K"]; }
    5055    }
    51 
     56    public ILookupParameter<IntValue> GenerationsParameter {
     57      get { return (ILookupParameter<IntValue>)Parameters["Generations"]; }
     58    }
     59    public IValueLookupParameter<IntValue> FirstGenerationParameter {
     60      get { return (IValueLookupParameter<IntValue>)Parameters["FirstGenerationParameter"]; }
     61    }
     62    public IValueLookupParameter<BoolValue> AntiOverfitParameter {
     63      get { return (IValueLookupParameter<BoolValue>)Parameters["AntiOverfit"]; }
     64    }
     65    public ILookupParameter<DataTable> ValidationQualityParameter {
     66      get { return (ILookupParameter<DataTable>)Parameters["Validation Quality"]; }
     67    }
     68    public ILookupParameter<DoubleValue> CurrentBestValidationQualityParameter {
     69      get { return (ILookupParameter<DoubleValue>)Parameters["Current best validation quality"]; }
     70    }
     71    public ILookupParameter<DoubleValue> BestValidationQualityParameter {
     72      get { return (ILookupParameter<DoubleValue>)Parameters["Best solution quality (validation)"]; }
     73    }
     74    public ILookupParameter<DoubleValue> LengthCorrelationParameter {
     75      get { return (ILookupParameter<DoubleValue>)Parameters["Correlation(Length, AdjustedFitness)"]; }
     76    }
     77    public ILookupParameter<DoubleValue> FitnessCorrelationParameter {
     78      get { return (ILookupParameter<DoubleValue>)Parameters["Correlation(Fitness, AdjustedFitness)"]; }
     79    }
     80    public IValueLookupParameter<IntValue> GenerationSpanParameter {
     81      get { return (IValueLookupParameter<IntValue>)Parameters["GenerationSpan"]; }
     82    }
     83    public IValueLookupParameter<PercentValue> OverfittingLimitParameter {
     84      get { return (IValueLookupParameter<PercentValue>)Parameters["OverfittingLimit"]; }
     85    }
     86    public IValueLookupParameter<PercentValue> ComplexityAdaptionParameter {
     87      get { return (IValueLookupParameter<PercentValue>)Parameters["ComplexityAdaption"]; }
     88    }
     89    public ILookupParameter<DataTable> QualitiesParameter {
     90      get { return (ILookupParameter<DataTable>)Parameters["Qualities"]; }
     91    }
    5292
    5393    public CovariantParsimonyPressure(bool deserializing) : base(deserializing) { }
     
    5696      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>("SymbolicExpressionTree"));
    5797      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
     98      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("AdjustedQuality"));
    5899      Parameters.Add(new LookupParameter<BoolValue>("Maximization"));
    59100      Parameters.Add(new ValueLookupParameter<DoubleValue>("K", new DoubleValue(1.0)));
     101      Parameters.Add(new LookupParameter<IntValue>("Generations"));
     102      Parameters.Add(new ValueLookupParameter<IntValue>("FirstGenerationParameter", new IntValue(5)));
     103      Parameters.Add(new ValueLookupParameter<BoolValue>("AntiOverfit", new BoolValue(false)));
     104      //Parameters.Add(new LookupParameter<DoubleValue>("Current best validation quality"));
     105      //Parameters.Add(new LookupParameter<DoubleValue>("Best solution quality (validation)"));
     106      Parameters.Add(new LookupParameter<DataTable>("Validation Quality"));
     107      Parameters.Add(new LookupParameter<DataTable>("Qualities"));
     108      Parameters.Add(new ValueLookupParameter<IntValue>("GenerationSpan", new IntValue(5)));
     109      Parameters.Add(new ValueLookupParameter<PercentValue>("OverfittingLimit", new PercentValue(5)));
     110      Parameters.Add(new ValueLookupParameter<PercentValue>("ComplexityAdaption", new PercentValue(-5)));
     111      Parameters.Add(new LookupParameter<DoubleValue>("Correlation(Length, AdjustedFitness)"));
     112      Parameters.Add(new LookupParameter<DoubleValue>("Correlation(Fitness, AdjustedFitness)"));
    60113    }
    61114
     
    66119      if (!Parameters.ContainsKey("K"))
    67120        Parameters.Add(new ValueLookupParameter<DoubleValue>("K", new DoubleValue(1.0)));
     121      if (!Parameters.ContainsKey("AdjustedQuality")) {
     122        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("AdjustedQuality"));
     123      }
     124      if (!Parameters.ContainsKey("Generations")) {
     125        Parameters.Add(new LookupParameter<IntValue>("Generations"));
     126      }
     127      if (!Parameters.ContainsKey("FirstGenerationParameter")) {
     128        Parameters.Add(new ValueLookupParameter<IntValue>("FirstGenerationParameter", new IntValue(5)));
     129      }
     130      if (!Parameters.ContainsKey("AntiOverfit")) {
     131        Parameters.Add(new ValueLookupParameter<BoolValue>("AntiOverfit", new BoolValue(false)));
     132      }
     133      //if (!Parameters.ContainsKey("Current best validation quality")) {
     134      //  Parameters.Add(new LookupParameter<DoubleValue>("Current best validation quality"));
     135      //}
     136      //if (!Parameters.ContainsKey("Best solution quality (validation)")) {
     137      //  Parameters.Add(new LookupParameter<DoubleValue>("Best solution quality (validation)"));
     138      //}
     139      if (!Parameters.ContainsKey("Correlation(Length, AdjustedFitness)")) {
     140        Parameters.Add(new LookupParameter<DoubleValue>("Correlation(Length, AdjustedFitness)"));
     141      }
     142      if (!Parameters.ContainsKey("Correlation(Fitness, AdjustedFitness)")) {
     143        Parameters.Add(new LookupParameter<DoubleValue>("Correlation(Fitness, AdjustedFitness)"));
     144      }
     145      if (!Parameters.ContainsKey("Validation Quality")) {
     146        Parameters.Add(new LookupParameter<DataTable>("Validation Quality"));
     147      }
     148      if (!Parameters.ContainsKey("Qualities")) {
     149        Parameters.Add(new LookupParameter<DataTable>("Qualities"));
     150      }
     151      if (!Parameters.ContainsKey("GenerationSpan")) {
     152        Parameters.Add(new ValueLookupParameter<IntValue>("GenerationSpan", new IntValue(5)));
     153      }
     154      if (!Parameters.ContainsKey("OverfittingLimit")) {
     155        Parameters.Add(new ValueLookupParameter<PercentValue>("OverfittingLimit", new PercentValue(5)));
     156      }
     157      if (!Parameters.ContainsKey("ComplexityAdaption")) {
     158        Parameters.Add(new ValueLookupParameter<PercentValue>("ComplexityAdaption", new PercentValue(-5)));
     159      }
    68160    }
    69161
    70162    public override IOperation Apply() {
    71       var trees = SymbolicExpressionTreeParameter.ActualValue;
    72       var qualities = QualityParameter.ActualValue;
    73       var lengths = from tree in trees
    74                     select tree.Size;
    75       double k = KParameter.ActualValue.Value;
    76 
    77       // calculate cov(f, l) and cov(l, l^k)
    78       OnlineCovarianceEvaluator lengthFitnessCovEvaluator = new OnlineCovarianceEvaluator();
    79       OnlineCovarianceEvaluator lengthAdjLengthCovEvaluator = new OnlineCovarianceEvaluator();
    80       var lengthEnumerator = lengths.GetEnumerator();
    81       var qualityEnumerator = qualities.GetEnumerator();
    82       while (lengthEnumerator.MoveNext() & qualityEnumerator.MoveNext()) {
    83         double fitness = qualityEnumerator.Current.Value;
    84         if (!MaximizationParameter.ActualValue.Value) {
    85           // use f = 1 / (1 + quality) for minimization problems
    86           fitness = 1.0 / (1.0 + fitness);
     163      ItemArray<SymbolicExpressionTree> trees = SymbolicExpressionTreeParameter.ActualValue;
     164      ItemArray<DoubleValue> qualities = QualityParameter.ActualValue;
     165      // always apply Parsimony pressure if anti-overfit is false
     166      // otherwise apply PP only when we are currently overfitting
     167      if (GenerationsParameter.ActualValue != null && GenerationsParameter.ActualValue.Value >= FirstGenerationParameter.ActualValue.Value &&
     168         (AntiOverfitParameter.ActualValue.Value == false || IsOverfitting())) {
     169        var lengths = from tree in trees
     170                      select tree.Size;
     171        double k = KParameter.ActualValue.Value;
     172
     173        // calculate cov(f, l) and cov(l, l^k)
     174        OnlineCovarianceEvaluator lengthFitnessCovEvaluator = new OnlineCovarianceEvaluator();
     175        OnlineCovarianceEvaluator lengthAdjLengthCovEvaluator = new OnlineCovarianceEvaluator();
     176        OnlineMeanAndVarianceCalculator lengthMeanCalculator = new OnlineMeanAndVarianceCalculator();
     177        OnlineMeanAndVarianceCalculator fitnessMeanCalculator = new OnlineMeanAndVarianceCalculator();
     178        OnlineMeanAndVarianceCalculator adjLengthMeanCalculator = new OnlineMeanAndVarianceCalculator();
     179        var lengthEnumerator = lengths.GetEnumerator();
     180        var qualityEnumerator = qualities.GetEnumerator();
     181        while (lengthEnumerator.MoveNext() & qualityEnumerator.MoveNext()) {
     182          double fitness = qualityEnumerator.Current.Value;
     183          if (!MaximizationParameter.ActualValue.Value) {
     184            // use f = 1 / (1 + quality) for minimization problems
     185            fitness = 1.0 / (1.0 + fitness);
     186          }
     187          lengthFitnessCovEvaluator.Add(lengthEnumerator.Current, fitness);
     188          lengthAdjLengthCovEvaluator.Add(lengthEnumerator.Current, Math.Pow(lengthEnumerator.Current, k));
     189          lengthMeanCalculator.Add(lengthEnumerator.Current);
     190          fitnessMeanCalculator.Add(fitness);
     191          adjLengthMeanCalculator.Add(Math.Pow(lengthEnumerator.Current, k));
    87192        }
    88         lengthFitnessCovEvaluator.Add(lengthEnumerator.Current, fitness);
    89         lengthAdjLengthCovEvaluator.Add(lengthEnumerator.Current, Math.Pow(lengthEnumerator.Current, k));
    90       }
    91 
    92       // c = cov(l, f) / cov(l, l^k)
    93       double c = lengthFitnessCovEvaluator.Covariance / lengthAdjLengthCovEvaluator.Covariance;
    94 
    95       // adjust fitness
     193
     194        double sizeAdaption = lengthMeanCalculator.Mean * ComplexityAdaptionParameter.ActualValue.Value;
     195        if (sizeAdaption < 0) sizeAdaption = Math.Floor(sizeAdaption);
     196        else sizeAdaption = Math.Ceiling(sizeAdaption);
     197        double g = lengthMeanCalculator.Mean + sizeAdaption;
     198
     199        //            cov(l, f) - (g(t+1) - mu(t)) avgF
     200        // c(t) =  --------------------------------------------
     201        //           cov(l, l^k) - (g(t+1) - mu(t)) E[l^k]
     202        double c = lengthFitnessCovEvaluator.Covariance - (g - lengthMeanCalculator.Mean) * fitnessMeanCalculator.Mean;
     203        c /= lengthAdjLengthCovEvaluator.Covariance - (g - lengthMeanCalculator.Mean) * adjLengthMeanCalculator.Mean;
     204
     205        // adjust fitness
     206        bool maximization = MaximizationParameter.ActualValue.Value;
     207
     208        lengthEnumerator = lengths.GetEnumerator();
     209        qualityEnumerator = qualities.GetEnumerator();
     210        int i = 0;
     211        ItemArray<DoubleValue> adjQualities = new ItemArray<DoubleValue>(qualities.Length);
     212
     213        while (lengthEnumerator.MoveNext() & qualityEnumerator.MoveNext()) {
     214          adjQualities[i++] = new DoubleValue(qualityEnumerator.Current.Value - c * Math.Pow(lengthEnumerator.Current, k));
     215        }
     216        AdjustedQualityParameter.ActualValue = adjQualities;
     217        double[] lengthArr = lengths.Select(x => (double)x).ToArray<double>();
     218
     219        double[] adjFitess = (from f in AdjustedQualityParameter.ActualValue
     220                              select f.Value).ToArray<double>();
     221        double[] fitnessArr = (from f in QualityParameter.ActualValue
     222                               let normFit = maximization ? f.Value : 1.0 / (1.0 + f.Value)
     223                               select normFit).ToArray<double>();
     224
     225        LengthCorrelationParameter.ActualValue = new DoubleValue(alglib.correlation.spearmanrankcorrelation(lengthArr, adjFitess, lengthArr.Length));
     226        FitnessCorrelationParameter.ActualValue = new DoubleValue(alglib.correlation.spearmanrankcorrelation(fitnessArr, adjFitess, lengthArr.Length));
     227
     228      } else {
     229        // adjusted fitness is equal to fitness
     230        AdjustedQualityParameter.ActualValue = (ItemArray<DoubleValue>)QualityParameter.ActualValue.Clone();
     231        FitnessCorrelationParameter.ActualValue = new DoubleValue(1.0);
     232
     233        double[] lengths = (from tree in trees
     234                            select (double)tree.Size).ToArray<double>();
     235
     236        double[] fitess = (from f in AdjustedQualityParameter.ActualValue
     237                           select f.Value).ToArray<double>();
     238
     239        LengthCorrelationParameter.ActualValue = new DoubleValue(alglib.correlation.spearmanrankcorrelation(lengths, fitess, lengths.Length));
     240      }
     241      return base.Apply();
     242    }
     243
     244    private bool IsOverfitting() {
    96245      bool maximization = MaximizationParameter.ActualValue.Value;
    97 
    98       lengthEnumerator = lengths.GetEnumerator();
    99       qualityEnumerator = qualities.GetEnumerator();
    100       while (lengthEnumerator.MoveNext() & qualityEnumerator.MoveNext()) {
    101         qualityEnumerator.Current.Value = qualityEnumerator.Current.Value - c * Math.Pow(lengthEnumerator.Current, k);
    102       }
    103 
    104       return base.Apply();
     246      DataTable trainingQualities = QualitiesParameter.ActualValue;
     247      DataTable validationQualities = ValidationQualityParameter.ActualValue;
     248      int genSpan = GenerationSpanParameter.ActualValue.Value;
     249      if (validationQualities == null || trainingQualities == null) return false;
     250      if (validationQualities.Rows["Best solution quality (validation)"].Values.Count < genSpan) return false;
     251
     252      IEnumerable<double> bestTrainingQualities = trainingQualities.Rows["CurrentBestQuality"].Values;
     253      IEnumerable<double> bestValidationQualities = validationQualities.Rows["Current best validation quality"].Values;
     254
     255      double trainingAvg = bestTrainingQualities.Reverse().Take(genSpan).Average();
     256      double validationAvg = bestValidationQualities.Reverse().Take(genSpan).Average();
     257
     258      double maxPercentDiff = OverfittingLimitParameter.ActualValue.Value;
     259
     260      double percentDiff = maximization ? trainingAvg / validationAvg - 1 : validationAvg / trainingAvg - 1;
     261      return percentDiff > maxPercentDiff;
    105262    }
    106263  }
Note: See TracChangeset for help on using the changeset viewer.