Free cookie consent management tool by TermsFeed Policy Generator

source: branches/GP.Grammar.Editor/HeuristicLab.Problems.DataAnalysis.Classification/3.3/Symbolic/Analyzer/ValidationBestSymbolicClassificationSolutionAnalyzer.cs @ 6934

Last change on this file since 6934 was 5863, checked in by mkommend, 14 years ago

#1418: Added NonDiscoverableType attribute to outdated analyzers.

File size: 22.5 KB
Line 
1#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Common;
26using HeuristicLab.Core;
27using HeuristicLab.Data;
28using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
29using HeuristicLab.Operators;
30using HeuristicLab.Optimization;
31using HeuristicLab.Parameters;
32using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
33using HeuristicLab.PluginInfrastructure;
34using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic;
35using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers;
36using HeuristicLab.Problems.DataAnalysis.Symbolic;
37
namespace HeuristicLab.Problems.DataAnalysis.Classification {
  /// <summary>
  /// Analyzer that evaluates every symbolic expression tree in scope on a random sample of the
  /// validation partition, remembers the best tree seen so far as a classification solution and
  /// publishes its quality, training/test accuracy and (optionally) length and height as results.
  /// </summary>
  [Item("ValidationBestSymbolicClassificationSolutionAnalyzer", "An operator that analyzes the validation best symbolic classification solution.")]
  [StorableClass]
  [NonDiscoverableType]
  public class ValidationBestSymbolicClassificationSolutionAnalyzer : SingleSuccessorOperator, ISymbolicClassificationAnalyzer {
    // names of the input parameters
    private const string MaximizationParameterName = "Maximization";
    private const string GenerationsParameterName = "Generations";
    private const string RandomParameterName = "Random";
    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";

    private const string ClassificationProblemDataParameterName = "ClassificationProblemData";
    private const string EvaluatorParameterName = "Evaluator";
    private const string ValidationSamplesStartParameterName = "SamplesStart";
    private const string ValidationSamplesEndParameterName = "SamplesEnd";
    private const string RelativeNumberOfEvaluatedSamplesParameterName = "RelativeNumberOfEvaluatedSamples";
    private const string UpperEstimationLimitParameterName = "UpperEstimationLimit";
    private const string LowerEstimationLimitParameterName = "LowerEstimationLimit";
    private const string CalculateSolutionComplexityParameterName = "CalculateSolutionComplexity";
    private const string ApplyLinearScalingParameterName = "ApplyLinearScaling";

    // names of the output parameters; they double as the keys of the entries in the results collection
    private const string ResultsParameterName = "Results";
    private const string BestValidationQualityParameterName = "Best validation quality";
    private const string BestValidationSolutionParameterName = "Best validation solution";
    private const string BestSolutionAccuracyTrainingParameterName = "Best solution accuracy (training)";
    private const string BestSolutionAccuracyTestParameterName = "Best solution accuracy (test)";
    private const string BestSolutionLengthParameterName = "Best solution length (on validation set)";
    private const string BestSolutionHeightParameterName = "Best solution height (on validation set)";
    private const string VariableFrequenciesParameterName = "VariableFrequencies";

    #region parameter properties
    public ILookupParameter<BoolValue> MaximizationParameter {
      get { return (ILookupParameter<BoolValue>)Parameters[MaximizationParameterName]; }
    }
    public ILookupParameter<IntValue> GenerationsParameter {
      get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
    }
    public ILookupParameter<IRandom> RandomParameter {
      get { return (ILookupParameter<IRandom>)Parameters[RandomParameterName]; }
    }
    public ScopeTreeLookupParameter<SymbolicExpressionTree> SymbolicExpressionTreeParameter {
      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
    }
    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
    }
    public ILookupParameter<ClassificationProblemData> ClassificationProblemDataParameter {
      get { return (ILookupParameter<ClassificationProblemData>)Parameters[ClassificationProblemDataParameterName]; }
    }
    public ILookupParameter<ISymbolicClassificationEvaluator> EvaluatorParameter {
      get { return (ILookupParameter<ISymbolicClassificationEvaluator>)Parameters[EvaluatorParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesStartParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesStartParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesEndParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesEndParameterName]; }
    }
    public IValueParameter<PercentValue> RelativeNumberOfEvaluatedSamplesParameter {
      get { return (IValueParameter<PercentValue>)Parameters[RelativeNumberOfEvaluatedSamplesParameterName]; }
    }
    public IValueLookupParameter<DoubleValue> UpperEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[UpperEstimationLimitParameterName]; }
    }
    public IValueLookupParameter<DoubleValue> LowerEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[LowerEstimationLimitParameterName]; }
    }
    public IValueLookupParameter<BoolValue> ApplyLinearScalingParameter {
      get { return (IValueLookupParameter<BoolValue>)Parameters[ApplyLinearScalingParameterName]; }
    }
    public ILookupParameter<DataTable> VariableFrequenciesParameter {
      get { return (ILookupParameter<DataTable>)Parameters[VariableFrequenciesParameterName]; }
    }
    public IValueParameter<BoolValue> CalculateSolutionComplexityParameter {
      get { return (IValueParameter<BoolValue>)Parameters[CalculateSolutionComplexityParameterName]; }
    }

    public ILookupParameter<ResultCollection> ResultsParameter {
      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestValidationQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestValidationQualityParameterName]; }
    }
    public ILookupParameter<SymbolicClassificationSolution> BestValidationSolutionParameter {
      get { return (ILookupParameter<SymbolicClassificationSolution>)Parameters[BestValidationSolutionParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestSolutionAccuracyTrainingParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionAccuracyTrainingParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestSolutionAccuracyTestParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionAccuracyTestParameterName]; }
    }
    public ILookupParameter<IntValue> BestSolutionLengthParameter {
      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionLengthParameterName]; }
    }
    public ILookupParameter<IntValue> BestSolutionHeightParameter {
      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionHeightParameterName]; }
    }
    #endregion
    #region properties
    public BoolValue Maximization {
      get { return MaximizationParameter.ActualValue; }
    }
    public IntValue Generations {
      get { return GenerationsParameter.ActualValue; }
    }
    public IRandom Random {
      get { return RandomParameter.ActualValue; }
    }
    public ItemArray<SymbolicExpressionTree> SymbolicExpressionTree {
      get { return SymbolicExpressionTreeParameter.ActualValue; }
    }
    public ISymbolicExpressionTreeInterpreter SymbolicExpressionTreeInterpreter {
      get { return SymbolicExpressionTreeInterpreterParameter.ActualValue; }
    }

    public ClassificationProblemData ClassificationProblemData {
      get { return ClassificationProblemDataParameter.ActualValue; }
    }
    public ISymbolicClassificationEvaluator Evaluator {
      get { return EvaluatorParameter.ActualValue; }
    }
    // NOTE: the name is misspelled ("Validiation"); it is public, so it is kept as is for compatibility.
    public IntValue ValidiationSamplesStart {
      get { return ValidationSamplesStartParameter.ActualValue; }
    }
    public IntValue ValidationSamplesEnd {
      get { return ValidationSamplesEndParameter.ActualValue; }
    }
    public PercentValue RelativeNumberOfEvaluatedSamples {
      get { return RelativeNumberOfEvaluatedSamplesParameter.Value; }
    }
    public DoubleValue UpperEstimationLimit {
      get { return UpperEstimationLimitParameter.ActualValue; }
    }
    public DoubleValue LowerEstimationLimit {
      get { return LowerEstimationLimitParameter.ActualValue; }
    }
    public BoolValue ApplyLinearScaling {
      get { return ApplyLinearScalingParameter.ActualValue; }
      set { ApplyLinearScalingParameter.ActualValue = value; }
    }
    public DataTable VariableFrequencies {
      get { return VariableFrequenciesParameter.ActualValue; }
    }
    public BoolValue CalculateSolutionComplexity {
      get { return CalculateSolutionComplexityParameter.Value; }
      set { CalculateSolutionComplexityParameter.Value = value; }
    }

    public ResultCollection Results {
      get { return ResultsParameter.ActualValue; }
    }
    public DoubleValue BestValidationQuality {
      get { return BestValidationQualityParameter.ActualValue; }
      protected set { BestValidationQualityParameter.ActualValue = value; }
    }
    public SymbolicClassificationSolution BestValidationSolution {
      get { return BestValidationSolutionParameter.ActualValue; }
      protected set { BestValidationSolutionParameter.ActualValue = value; }
    }
    public DoubleValue BestSolutionAccuracyTraining {
      get { return BestSolutionAccuracyTrainingParameter.ActualValue; }
      protected set { BestSolutionAccuracyTrainingParameter.ActualValue = value; }
    }
    public DoubleValue BestSolutionAccuracyTest {
      get { return BestSolutionAccuracyTestParameter.ActualValue; }
      protected set { BestSolutionAccuracyTestParameter.ActualValue = value; }
    }
    public IntValue BestSolutionLength {
      get { return BestSolutionLengthParameter.ActualValue; }
      set { BestSolutionLengthParameter.ActualValue = value; }
    }
    public IntValue BestSolutionHeight {
      get { return BestSolutionHeightParameter.ActualValue; }
      set { BestSolutionHeightParameter.ActualValue = value; }
    }
    #endregion

    [StorableConstructor]
    protected ValidationBestSymbolicClassificationSolutionAnalyzer(bool deserializing) : base(deserializing) { }
    protected ValidationBestSymbolicClassificationSolutionAnalyzer(ValidationBestSymbolicClassificationSolutionAnalyzer original, Cloner cloner)
      : base(original, cloner) {
    }
    /// <summary>
    /// Creates the analyzer and registers all of its input and output parameters.
    /// </summary>
    public ValidationBestSymbolicClassificationSolutionAnalyzer()
      : base() {
      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "The random generator to use."));
      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
      Parameters.Add(new LookupParameter<ClassificationProblemData>(ClassificationProblemDataParameterName, "The problem data for which the symbolic expression tree is a solution."));
      Parameters.Add(new LookupParameter<ISymbolicClassificationEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesStartParameterName, "The first index of the validation partition of the data set."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesEndParameterName, "The last index of the validation partition of the data set."));
      Parameters.Add(new ValueParameter<PercentValue>(RelativeNumberOfEvaluatedSamplesParameterName, "The relative number of samples of the dataset partition, which should be randomly chosen for evaluation between the start and end index.", new PercentValue(1)));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(UpperEstimationLimitParameterName, "The upper estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
      Parameters.Add(new ValueParameter<BoolValue>(CalculateSolutionComplexityParameterName, "Determines if the length and height of the validation best solution should be calculated.", new BoolValue(true)));
      Parameters.Add(new ValueLookupParameter<BoolValue>(ApplyLinearScalingParameterName, "The switch determines if the best solution should be linearly scaled on the whole training set.", new BoolValue(false)));

      Parameters.Add(new ValueLookupParameter<ResultCollection>(ResultsParameterName, "The results collection where the analysis values should be stored."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestValidationQualityParameterName, "The validation quality of the best solution in the current run."));
      Parameters.Add(new LookupParameter<SymbolicClassificationSolution>(BestValidationSolutionParameterName, "The best solution on the validation data found in the current run."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionAccuracyTrainingParameterName, "The training accuracy of the best solution."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionAccuracyTestParameterName, "The test accuracy of the best solution."));
      Parameters.Add(new LookupParameter<IntValue>(BestSolutionLengthParameterName, "The length of the best symbolic classification solution."));
      Parameters.Add(new LookupParameter<IntValue>(BestSolutionHeightParameterName, "The height of the best symbolic classification solution."));
    }

    /// <summary>
    /// Adds any of the complexity and linear scaling parameters that are missing after deserialization
    /// (presumably because the instance was stored by an older version of this class - TODO confirm).
    /// </summary>
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      if (!Parameters.ContainsKey(CalculateSolutionComplexityParameterName)) {
        Parameters.Add(new ValueParameter<BoolValue>(CalculateSolutionComplexityParameterName, "Determines if the length and height of the validation best solution should be calculated.", new BoolValue(true)));
      }
      if (!Parameters.ContainsKey(BestSolutionLengthParameterName)) {
        Parameters.Add(new LookupParameter<IntValue>(BestSolutionLengthParameterName, "The length of the best symbolic classification solution."));
      }
      if (!Parameters.ContainsKey(BestSolutionHeightParameterName)) {
        Parameters.Add(new LookupParameter<IntValue>(BestSolutionHeightParameterName, "The height of the best symbolic classification solution."));
      }
      if (!Parameters.ContainsKey(ApplyLinearScalingParameterName)) {
        Parameters.Add(new ValueLookupParameter<BoolValue>(ApplyLinearScalingParameterName, "The switch determines if the best solution should be linearly scaled on the whole training set.", new BoolValue(false)));
      }
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ValidationBestSymbolicClassificationSolutionAnalyzer(this, cloner);
    }

    /// <summary>
    /// Evaluates all trees on a random sample of the validation rows (rows inside the test partition
    /// are skipped) and, if the best of them beats the best validation quality stored so far,
    /// replaces the stored best validation solution and refreshes the results.
    /// </summary>
    /// <returns>The operation returned by the base implementation.</returns>
    public override IOperation Apply() {
      var trees = SymbolicExpressionTree;
      // resolve the looked-up values once instead of once per tree / per row
      var problemData = ClassificationProblemData;
      var interpreter = SymbolicExpressionTreeInterpreter;
      var evaluator = Evaluator;
      string targetVariable = problemData.TargetVariable.Value;
      bool maximization = Maximization.Value;

      // select a random subset of rows in the validation set
      int validationStart = ValidiationSamplesStart.Value;
      int validationEnd = ValidationSamplesEnd.Value;
      int seed = Random.Next();
      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
      if (count == 0) count = 1;
      int testStart = problemData.TestSamplesStart.Value;
      int testEnd = problemData.TestSamplesEnd.Value;
      // Materialized once so that the sampling and filtering are not repeated for every tree and
      // every tree is guaranteed to be evaluated on exactly the same rows.
      int[] rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count)
        .Where(row => row < testStart || testEnd <= row)
        .ToArray();

      // unset limits mean "unbounded"
      double upperEstimationLimit = UpperEstimationLimit != null ? UpperEstimationLimit.Value : double.PositiveInfinity;
      double lowerEstimationLimit = LowerEstimationLimit != null ? LowerEstimationLimit.Value : double.NegativeInfinity;

      double bestQuality = maximization ? double.NegativeInfinity : double.PositiveInfinity;
      SymbolicExpressionTree bestTree = null;

      foreach (var tree in trees) {
        double quality = evaluator.Evaluate(interpreter, tree,
          lowerEstimationLimit, upperEstimationLimit, problemData.Dataset,
          targetVariable, rows);

        bool isBetter = maximization ? quality > bestQuality : quality < bestQuality;
        if (isBetter) {
          bestQuality = quality;
          bestTree = tree;
        }
      }

      // No tree at all, or no tree whose quality improved on the initial bound (e.g. only NaN
      // qualities): there is nothing a solution could be built from.
      if (bestTree == null) return base.Apply();

      // if the best validation tree is better than the current best solution => update
      DoubleValue storedBestQuality = BestValidationQuality;
      bool newBest =
        storedBestQuality == null ||
        (maximization ? bestQuality > storedBestQuality.Value : bestQuality < storedBestQuality.Value);
      if (!newBest) return base.Apply();

      if (ApplyLinearScaling.Value) {
        double alpha, beta;
        SymbolicRegressionScaledMeanSquaredErrorEvaluator.Calculate(interpreter, bestTree,
          lowerEstimationLimit, upperEstimationLimit,
          problemData.Dataset, targetVariable,
          problemData.TrainingIndizes, out beta, out alpha);

        // scale tree for solution
        bestTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta);
      }
      var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)interpreter.Clone(), bestTree);

      SymbolicClassificationSolution solution = BestValidationSolution;
      if (solution == null) {
        // Use the resolved bounds: the limit parameters themselves may be null (see above), in which
        // case dereferencing them here would throw.
        solution = new SymbolicClassificationSolution(problemData, model, lowerEstimationLimit, upperEstimationLimit);
        BestValidationSolution = solution;
        solution.Name = BestValidationSolutionParameterName;
        solution.Description = "Best solution on validation partition found over the whole run.";
        BestValidationQuality = new DoubleValue(bestQuality);
      } else {
        solution.Model = model;
        if (storedBestQuality == null) {
          BestValidationQuality = new DoubleValue(bestQuality);
        } else {
          storedBestQuality.Value = bestQuality;
        }
      }

      UpdateBestSolutionResults();
      return base.Apply();
    }

    /// <summary>
    /// Writes the figures of the current best validation solution to the results collection:
    /// length and height (only if <see cref="CalculateSolutionComplexity"/> is set), the entries
    /// maintained by the regression analyzer's helper, and the accuracy on the training and test indices.
    /// </summary>
    private void UpdateBestSolutionResults() {
      var solution = BestValidationSolution;
      var problemData = ClassificationProblemData;
      var results = Results;

      if (CalculateSolutionComplexity.Value) {
        BestSolutionLength = new IntValue(solution.Model.SymbolicExpressionTree.Size);
        BestSolutionHeight = new IntValue(solution.Model.SymbolicExpressionTree.Height);
        // both entries are always added together, so checking one key is sufficient
        if (!results.ContainsKey(BestSolutionLengthParameterName)) {
          results.Add(new Result(BestSolutionLengthParameterName, "Length of the best solution on the validation set", new IntValue()));
          results.Add(new Result(BestSolutionHeightParameterName, "Height of the best solution on the validation set", new IntValue()));
        }
        results[BestSolutionLengthParameterName].Value = BestSolutionLength;
        results[BestSolutionHeightParameterName].Value = BestSolutionHeight;
      }

      BestSymbolicRegressionSolutionAnalyzer.UpdateBestSolutionResults(solution, problemData, results, Generations, VariableFrequencies);

      string targetVariable = problemData.TargetVariable.Value;
      IEnumerable<double> trainingValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariable, problemData.TrainingIndizes);
      IEnumerable<double> testValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariable, problemData.TestIndizes);

      OnlineAccuracyEvaluator accuracyEvaluator = new OnlineAccuracyEvaluator();
      double trainingAccuracy = CalculateAccuracy(accuracyEvaluator, trainingValues, solution.EstimatedTrainingClassValues);

      accuracyEvaluator.Reset();
      double testAccuracy = CalculateAccuracy(accuracyEvaluator, testValues, solution.EstimatedTestClassValues);

      if (!results.ContainsKey(BestSolutionAccuracyTrainingParameterName)) {
        BestSolutionAccuracyTraining = new DoubleValue(trainingAccuracy);
        BestSolutionAccuracyTest = new DoubleValue(testAccuracy);

        results.Add(new Result(BestSolutionAccuracyTrainingParameterName, BestSolutionAccuracyTraining));
        results.Add(new Result(BestSolutionAccuracyTestParameterName, BestSolutionAccuracyTest));
      } else {
        // the result entries hold these very value objects, so updating them in place is enough
        BestSolutionAccuracyTraining.Value = trainingAccuracy;
        BestSolutionAccuracyTest.Value = testAccuracy;
      }
    }

    // Feeds the pairs of original and estimated values to the evaluator and returns its accuracy.
    private static double CalculateAccuracy(OnlineAccuracyEvaluator evaluator, IEnumerable<double> originalValues, IEnumerable<double> estimatedValues) {
      using (IEnumerator<double> originalEnumerator = originalValues.GetEnumerator())
      using (IEnumerator<double> estimatedEnumerator = estimatedValues.GetEnumerator()) {
        // non-short-circuiting '&' on purpose: both enumerators are advanced in every iteration
        while (originalEnumerator.MoveNext() & estimatedEnumerator.MoveNext()) {
          evaluator.Add(originalEnumerator.Current, estimatedEnumerator.Current);
        }
      }
      return evaluator.Accuracy;
    }
  }
}
Note: See TracBrowser for help on using the repository browser.