Changeset 4297


Ignore:
Timestamp:
08/23/10 18:54:35 (12 years ago)
Author:
gkronber
Message:

Added output parameter for validation quality to validation analyzer, added input parameter for validation quality to overfitting analyzer, and fixed bugs in pruning operator. #1142

Location:
branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs

    r4272 r4297  
    146146      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; }
    147147    }
     148    public ScopeTreeLookupParameter<DoubleValue> ValidationQualityParameter {
     149      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["ValidationQuality"]; }
     150    }
    148151
    149152    public ILookupParameter<IntValue> GenerationsParameter {
     
    237240      Parameters.Add(new LookupParameter<SymbolicRegressionSolution>("BestTrainingSolution"));
    238241      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
     242      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("ValidationQuality"));
    239243      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
    240244      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionQualityParameterName, "The quality of the best symbolic regression solution."));
     
    267271        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
    268272      }
     273      if (!Parameters.ContainsKey("ValidationQuality")) {
     274        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("ValidationQuality"));
     275      }
    269276      #endregion
    270277    }
     
    291298      SymbolicExpressionTree bestTrainingTree = trees[0];
    292299      double bestTrainingQuality = qualities[0].Value;
     300      ItemArray<DoubleValue> validationQualites = new ItemArray<DoubleValue>(qualities.Length);
    293301      for (int i = 0; i < trees.Length; i++) {
    294302        SymbolicExpressionTree tree = trees[i];
     
    297305          ProblemData.Dataset, targetVariable,
    298306         rows);
    299 
     307        validationQualites[i] = new DoubleValue(quality);
    300308        if ((Maximization.Value && quality > bestQuality) ||
    301309            (!Maximization.Value && quality < bestQuality)) {
     
    309317        }
    310318      }
     319      ValidationQualityParameter.ActualValue = validationQualites;
    311320
    312321      var scaledBestTrainingTree = GetScaledTree(bestTrainingTree);
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/OverfittingAnalyzer.cs

    r4275 r4297  
    6161      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; }
    6262    }
     63    public ScopeTreeLookupParameter<DoubleValue> ValidationQualityParameter {
     64      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["ValidationQuality"]; }
     65    }
    6366    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
    6467      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
     
    163166      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
    164167      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
     168      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("ValidationQuality"));
    165169      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
    166170      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
     
    180184      Parameters.Add(new LookupParameter<DoubleValue>("InitialTrainingQuality"));
    181185      Parameters.Add(new LookupParameter<DoubleMatrix>("TrainingAndValidationQualities"));
    182       Parameters.Add(new ValueLookupParameter<DoubleValue>("Percentile", new DoubleValue(0.1)));
     186      Parameters.Add(new ValueLookupParameter<DoubleValue>("Percentile", new DoubleValue(1)));
    183187
    184188    }
     
    202206      }
    203207      if (!Parameters.ContainsKey("Percentile")) {
    204         Parameters.Add(new ValueLookupParameter<DoubleValue>("Percentile", new DoubleValue(0.1)));
     208        Parameters.Add(new ValueLookupParameter<DoubleValue>("Percentile", new DoubleValue(1)));
     209      }
     210      if (!Parameters.ContainsKey("ValidationQuality")) {
     211        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("ValidationQuality"));
    205212      }
    206213    }
     
    209216      var trees = SymbolicExpressionTree;
    210217      ItemArray<DoubleValue> qualities = QualityParameter.ActualValue;
     218      ItemArray<DoubleValue> validationQualities = ValidationQualityParameter.ActualValue;
    211219
    212220      string targetVariable = ProblemData.TargetVariable.Value;
     
    226234      //SymbolicExpressionTree bestTree = null;
    227235
    228       List<double> validationQualities = new List<double>();
    229       foreach (var tree in trees) {
    230         double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree,
    231           lowerEstimationLimit, upperEstimationLimit,
    232           ProblemData.Dataset, targetVariable,
    233          rows);
    234         validationQualities.Add(quality);
    235         //if ((Maximization.Value && quality > bestQuality) ||
    236         //    (!Maximization.Value && quality < bestQuality)) {
    237         //  bestQuality = quality;
    238         //  bestTree = tree;
    239         //}
    240       }
     236      //List<double> validationQualities = new List<double>();
     237      //foreach (var tree in trees) {
     238      //  double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree,
     239      //    lowerEstimationLimit, upperEstimationLimit,
     240      //    ProblemData.Dataset, targetVariable,
     241      //   rows);
     242      //  validationQualities.Add(quality);
     243      //  //if ((Maximization.Value && quality > bestQuality) ||
     244      //  //    (!Maximization.Value && quality < bestQuality)) {
     245      //  //  bestQuality = quality;
     246      //  //  bestTree = tree;
     247      //  //}
     248      //}
    241249
    242250      //if (RelativeValidationQualityParameter.ActualValue == null) {
    243251      // first call initialize the relative quality using the difference between average training and validation quality
    244252      double avgTrainingQuality = qualities.Select(x => x.Value).Median();
    245       double avgValidationQuality = validationQualities.Median();
     253      double avgValidationQuality = validationQualities.Select(x => x.Value).Median();
    246254
    247255      if (Maximization.Value)
     
    254262      // best first (only for maximization)
    255263      var orderedDistinctPairs = (from index in Enumerable.Range(0, qualities.Length)
    256                                   select new { Training = qualities[index].Value, Validation = validationQualities[index] })
    257                                  .Distinct()
     264                                  select new { Training = qualities[index].Value, Validation = validationQualities[index].Value })
    258265                                 .OrderBy(x => -x.Training)
    259266                                 .ToList();
  • branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/SymbolicRegressionTournamentPruning.cs

    r4195 r4297  
    3131using HeuristicLab.Problems.DataAnalysis.Symbolic.Symbols;
    3232using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     33using System;
    3334
    3435namespace HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers {
     
    6263      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
    6364    }
     65    public ScopeTreeLookupParameter<DoubleValue> QualityParameter {
     66      get { return (ScopeTreeLookupParameter<DoubleValue>)Parameters["Quality"]; }
     67    }
    6468    public ILookupParameter<DataAnalysisProblemData> DataAnalysisProblemDataParameter {
    6569      get { return (ILookupParameter<DataAnalysisProblemData>)Parameters[DataAnalysisProblemDataParameterName]; }
     
    8084      get { return (IValueLookupParameter<IntValue>)Parameters[SamplesEndParameterName]; }
    8185    }
     86    public IValueLookupParameter<PercentValue> RelativeNumberOfEvaluatedRowsParameters {
     87      get { return (IValueLookupParameter<PercentValue>)Parameters["RelativeNumberOfEvaluatedRows"]; }
     88    }
    8289    public ILookupParameter<ISymbolicRegressionEvaluator> EvaluatorParameter {
    8390      get { return (ILookupParameter<ISymbolicRegressionEvaluator>)Parameters[EvaluatorParameterName]; }
     
    115122    public ILookupParameter<ResultCollection> ResultsParameter {
    116123      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
     124    }
     125    public IValueLookupParameter<BoolValue> ApplyPruningParameter {
     126      get { return (IValueLookupParameter<BoolValue>)Parameters["ApplyPruning"]; }
    117127    }
    118128    #endregion
     
    176186    }
    177187    #endregion
     188    [StorableConstructor]
    178189    protected SymbolicRegressionTournamentPruning(bool deserializing) : base(deserializing) { }
    179190    public SymbolicRegressionTournamentPruning()
     
    181192      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "A random number generator."));
    182193      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to prune."));
     194      Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
    183195      Parameters.Add(new LookupParameter<DataAnalysisProblemData>(DataAnalysisProblemDataParameterName, "The data analysis problem data to use for branch impact evaluation."));
    184196      Parameters.Add(new LookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter to use for node impact evaluation"));
     
    187199      Parameters.Add(new LookupParameter<ISymbolicRegressionEvaluator>(EvaluatorParameterName, "The evaluator that should be used to determine which branches are not relevant."));
    188200      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
     201      Parameters.Add(new ValueLookupParameter<BoolValue>("ApplyPruning"));
    189202      Parameters.Add(new ValueLookupParameter<DoubleValue>(MaxPruningRatioParameterName, "The maximal relative size of the pruned branch.", new DoubleValue(0.5)));
    190203      Parameters.Add(new ValueLookupParameter<IntValue>(TournamentSizeParameterName, "The number of branches to compare for pruning", new IntValue(10)));
     
    199212      Parameters.Add(new LookupParameter<IntValue>(GenerationParameterName, "The current generation."));
    200213      Parameters.Add(new LookupParameter<ResultCollection>(ResultsParameterName, "The results collection."));
     214      Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeNumberOfEvaluatedRows", new PercentValue(1.0)));
    201215    }
    202216
     
    210224        Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
    211225      }
     226      if (!Parameters.ContainsKey("ApplyPruning")) {
     227        Parameters.Add(new ValueLookupParameter<BoolValue>("ApplyPruning"));
     228      }
     229      if (!Parameters.ContainsKey("Quality")) {
     230        Parameters.Add(new ScopeTreeLookupParameter<DoubleValue>("Quality"));
     231      }
     232      if (!Parameters.ContainsKey("RelativeNumberOfEvaluatedRows")) {
     233        Parameters.Add(new ValueLookupParameter<PercentValue>("RelativeNumberOfEvaluatedRows", new PercentValue(1.0)));
     234      }
     235
    212236      #endregion
    213237    }
     
    215239    public override IOperation Apply() {
    216240      bool pruningCondition =
     241        (ApplyPruningParameter.ActualValue.Value) &&
    217242        (Generation.Value >= FirstPruningGeneration.Value) &&
    218243        ((Generation.Value - FirstPruningGeneration.Value) % PruningFrequency.Value == 0);
     
    222247        double percentileEnd = PopulationPercentileEnd.Value;
    223248        // for each tree in the given percentile
    224         var trees = SymbolicExpressionTree
    225           .Skip((int)(n * percentileStart))
    226           .Take((int)(n * (percentileEnd - percentileStart)));
    227         foreach (var tree in trees) {
    228           Prune(Random, tree, Iterations.Value, TournamentSize.Value,
    229             DataAnalysisProblemData, SamplesStart.Value, SamplesEnd.Value,
     249        ItemArray<SymbolicExpressionTree> trees = SymbolicExpressionTree;
     250        ItemArray<DoubleValue> quality = QualityParameter.ActualValue;
     251        bool maximization = Maximization.Value;
     252        var selectedTrees = (from index in Enumerable.Range(0, n)
     253                             orderby maximization ? -quality[index].Value : quality[index].Value
     254                             select new { Tree = trees[index], Quality = quality[index] })
     255                                                            .Skip((int)(n * percentileStart))
     256                                                            .Take((int)(n * (percentileEnd - percentileStart)));
     257        foreach (var pair in selectedTrees) {
     258          Prune(Random, pair.Tree, pair.Quality, Iterations.Value, TournamentSize.Value,
     259            DataAnalysisProblemData, SamplesStart.Value, SamplesEnd.Value, RelativeNumberOfEvaluatedRowsParameters.ActualValue.Value,
    230260            SymbolicExpressionTreeInterpreter, Evaluator, Maximization.Value,
    231261            LowerEstimationLimit.Value, UpperEstimationLimit.Value,
     
    236266    }
    237267
    238     public static void Prune(IRandom random, SymbolicExpressionTree tree, int iterations, int tournamentSize,
    239       DataAnalysisProblemData problemData, int samplesStart, int samplesEnd,
     268    public static void Prune(IRandom random, SymbolicExpressionTree tree, DoubleValue quality, int iterations, int tournamentSize,
     269      DataAnalysisProblemData problemData, int samplesStart, int samplesEnd, double relativeNumberOfEvaluatedRows,
    240270      ISymbolicExpressionTreeInterpreter interpreter, ISymbolicRegressionEvaluator evaluator, bool maximization,
    241271      double lowerEstimationLimit, double upperEstimationLimit,
    242272      double maxPruningRatio, double qualityGainWeight) {
    243       IEnumerable<int> rows = Enumerable.Range(samplesStart, samplesEnd - samplesStart);
     273
     274      IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(samplesStart, samplesEnd, (int)Math.Ceiling((samplesEnd - samplesStart) * relativeNumberOfEvaluatedRows));
    244275      int originalSize = tree.Size;
    245       double originalQuality = evaluator.Evaluate(interpreter, tree,
    246         lowerEstimationLimit, upperEstimationLimit, problemData.Dataset, problemData.TargetVariable.Value, rows);
    247276
    248277      int minPrunedSize = (int)(originalSize * (1 - maxPruningRatio));
    249 
    250278      // tree for branch evaluation
    251279      SymbolicExpressionTree templateTree = (SymbolicExpressionTree)tree.Clone();
     
    253281
    254282      SymbolicExpressionTree prunedTree = tree;
     283      double currentQuality = quality.Value;
    255284      for (int iteration = 0; iteration < iterations; iteration++) {
    256285        SymbolicExpressionTree iterationBestTree = prunedTree;
     
    261290          var clonedTree = (SymbolicExpressionTree)prunedTree.Clone();
    262291          int clonedTreeSize = clonedTree.Size;
    263           var prunePoints = (from node in clonedTree.IterateNodesPostfix()
     292          var prunePoints = (from node in clonedTree.Root.SubTrees[0].IterateNodesPostfix()
    264293                             from subTree in node.SubTrees
    265294                             let subTreeSize = subTree.GetSize()
     
    280309
    281310            double prunedQuality = evaluator.Evaluate(interpreter, clonedTree,
    282         lowerEstimationLimit, upperEstimationLimit, problemData.Dataset, problemData.TargetVariable.Value, Enumerable.Range(samplesStart, samplesEnd - samplesStart));
     311        lowerEstimationLimit, upperEstimationLimit, problemData.Dataset, problemData.TargetVariable.Value, rows);
    283312            double prunedSize = clonedTree.Size;
    284313            // deterioration in quality:
     
    289318            //      R²  : newR² < origR²   (deterioration) => prefer smaller deterioration
    290319            //      R²  : minimize: origR² / newR²
    291             double qualityDeteriation = maximization ? originalQuality / prunedQuality : prunedQuality / originalQuality;
     320            double qualityDeteriation = maximization ? quality.Value / prunedQuality : prunedQuality / quality.Value;
    292321            // size of the pruned tree is always smaller than the size of the original tree
    293322            // same change in quality => prefer pruning operation that removes a larger tree
     
    297326              bestGain = gain;
    298327              iterationBestTree = clonedTree;
     328              currentQuality = prunedQuality;
    299329            }
    300330          }
     
    302332        prunedTree = iterationBestTree;
    303333      }
     334
     335      quality.Value = currentQuality;
    304336      tree.Root = prunedTree.Root;
    305337    }
Note: See TracChangeset for help on using the changeset viewer.