Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis.Classification/3.3/Symbolic/Analyzer/ValidationBestSymbolicClassificationSolutionAnalyzer.cs @ 7214

Last change on this file since 7214 was 7214, checked in by ascheibe, 12 years ago

#1706 adapted outdated plugins to changes in IAnalyzer

File size: 22.6 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Common;
26using HeuristicLab.Core;
27using HeuristicLab.Data;
28using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
29using HeuristicLab.Operators;
30using HeuristicLab.Optimization;
31using HeuristicLab.Parameters;
32using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
33using HeuristicLab.PluginInfrastructure;
34using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic;
35using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers;
36using HeuristicLab.Problems.DataAnalysis.Symbolic;
37
38namespace HeuristicLab.Problems.DataAnalysis.Classification {
39  [Item("ValidationBestSymbolicClassificationSolutionAnalyzer", "An operator that analyzes the validation best symbolic classification solution.")]
40  [StorableClass]
41  [NonDiscoverableType]
42  public class ValidationBestSymbolicClassificationSolutionAnalyzer : SingleSuccessorOperator, ISymbolicClassificationAnalyzer {
43    private const string MaximizationParameterName = "Maximization";
44    private const string GenerationsParameterName = "Generations";
45    private const string RandomParameterName = "Random";
46    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
47    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";
48
49    private const string ClassificationProblemDataParameterName = "ClassificationProblemData";
50    private const string EvaluatorParameterName = "Evaluator";
51    private const string ValidationSamplesStartParameterName = "SamplesStart";
52    private const string ValidationSamplesEndParameterName = "SamplesEnd";
53    private const string RelativeNumberOfEvaluatedSamplesParameterName = "RelativeNumberOfEvaluatedSamples";
54    private const string UpperEstimationLimitParameterName = "UpperEstimationLimit";
55    private const string LowerEstimationLimitParameterName = "LowerEstimationLimit";
56    private const string CalculateSolutionComplexityParameterName = "CalculateSolutionComplexity";
57    private const string ApplyLinearScalingParameterName = "ApplyLinearScaling";
58
59    private const string ResultsParameterName = "Results";
60    private const string BestValidationQualityParameterName = "Best validation quality";
61    private const string BestValidationSolutionParameterName = "Best validation solution";
62    private const string BestSolutionAccuracyTrainingParameterName = "Best solution accuracy (training)";
63    private const string BestSolutionAccuracyTestParameterName = "Best solution accuracy (test)";
64    private const string BestSolutionLengthParameterName = "Best solution length (on validation set)";
65    private const string BestSolutionHeightParameterName = "Best solution height (on validation set)";
66    private const string VariableFrequenciesParameterName = "VariableFrequencies";
67
68    public virtual bool EnabledByDefault {
69      get { return true; }
70    }
71
72    #region parameter properties
73    public ILookupParameter<BoolValue> MaximizationParameter {
74      get { return (ILookupParameter<BoolValue>)Parameters[MaximizationParameterName]; }
75    }
76    public ILookupParameter<IntValue> GenerationsParameter {
77      get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
78    }
79    public ILookupParameter<IRandom> RandomParameter {
80      get { return (ILookupParameter<IRandom>)Parameters[RandomParameterName]; }
81    }
82    public ScopeTreeLookupParameter<SymbolicExpressionTree> SymbolicExpressionTreeParameter {
83      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
84    }
85    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
86      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
87    }
88    public ILookupParameter<ClassificationProblemData> ClassificationProblemDataParameter {
89      get { return (ILookupParameter<ClassificationProblemData>)Parameters[ClassificationProblemDataParameterName]; }
90    }
91    public ILookupParameter<ISymbolicClassificationEvaluator> EvaluatorParameter {
92      get { return (ILookupParameter<ISymbolicClassificationEvaluator>)Parameters[EvaluatorParameterName]; }
93    }
94    public IValueLookupParameter<IntValue> ValidationSamplesStartParameter {
95      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesStartParameterName]; }
96    }
97    public IValueLookupParameter<IntValue> ValidationSamplesEndParameter {
98      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesEndParameterName]; }
99    }
100    public IValueParameter<PercentValue> RelativeNumberOfEvaluatedSamplesParameter {
101      get { return (IValueParameter<PercentValue>)Parameters[RelativeNumberOfEvaluatedSamplesParameterName]; }
102    }
103    public IValueLookupParameter<DoubleValue> UpperEstimationLimitParameter {
104      get { return (IValueLookupParameter<DoubleValue>)Parameters[UpperEstimationLimitParameterName]; }
105    }
106    public IValueLookupParameter<DoubleValue> LowerEstimationLimitParameter {
107      get { return (IValueLookupParameter<DoubleValue>)Parameters[LowerEstimationLimitParameterName]; }
108    }
109    public IValueLookupParameter<BoolValue> ApplyLinearScalingParameter {
110      get { return (IValueLookupParameter<BoolValue>)Parameters[ApplyLinearScalingParameterName]; }
111    }
112    public ILookupParameter<DataTable> VariableFrequenciesParameter {
113      get { return (ILookupParameter<DataTable>)Parameters[VariableFrequenciesParameterName]; }
114    }
115    public IValueParameter<BoolValue> CalculateSolutionComplexityParameter {
116      get { return (IValueParameter<BoolValue>)Parameters[CalculateSolutionComplexityParameterName]; }
117    }
118
119    public ILookupParameter<ResultCollection> ResultsParameter {
120      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
121    }
122    public ILookupParameter<DoubleValue> BestValidationQualityParameter {
123      get { return (ILookupParameter<DoubleValue>)Parameters[BestValidationQualityParameterName]; }
124    }
125    public ILookupParameter<SymbolicClassificationSolution> BestValidationSolutionParameter {
126      get { return (ILookupParameter<SymbolicClassificationSolution>)Parameters[BestValidationSolutionParameterName]; }
127    }
128    public ILookupParameter<DoubleValue> BestSolutionAccuracyTrainingParameter {
129      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionAccuracyTrainingParameterName]; }
130    }
131    public ILookupParameter<DoubleValue> BestSolutionAccuracyTestParameter {
132      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionAccuracyTestParameterName]; }
133    }
134    public ILookupParameter<IntValue> BestSolutionLengthParameter {
135      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionLengthParameterName]; }
136    }
137    public ILookupParameter<IntValue> BestSolutionHeightParameter {
138      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionHeightParameterName]; }
139    }
140    #endregion
141    #region properties
142    public BoolValue Maximization {
143      get { return MaximizationParameter.ActualValue; }
144    }
145    public IntValue Generations {
146      get { return GenerationsParameter.ActualValue; }
147    }
148    public IRandom Random {
149      get { return RandomParameter.ActualValue; }
150    }
151    public ItemArray<SymbolicExpressionTree> SymbolicExpressionTree {
152      get { return SymbolicExpressionTreeParameter.ActualValue; }
153    }
154    public ISymbolicExpressionTreeInterpreter SymbolicExpressionTreeInterpreter {
155      get { return SymbolicExpressionTreeInterpreterParameter.ActualValue; }
156    }
157
158    public ClassificationProblemData ClassificationProblemData {
159      get { return ClassificationProblemDataParameter.ActualValue; }
160    }
161    public ISymbolicClassificationEvaluator Evaluator {
162      get { return EvaluatorParameter.ActualValue; }
163    }
164    public IntValue ValidiationSamplesStart {
165      get { return ValidationSamplesStartParameter.ActualValue; }
166    }
167    public IntValue ValidationSamplesEnd {
168      get { return ValidationSamplesEndParameter.ActualValue; }
169    }
170    public PercentValue RelativeNumberOfEvaluatedSamples {
171      get { return RelativeNumberOfEvaluatedSamplesParameter.Value; }
172    }
173    public DoubleValue UpperEstimationLimit {
174      get { return UpperEstimationLimitParameter.ActualValue; }
175    }
176    public DoubleValue LowerEstimationLimit {
177      get { return LowerEstimationLimitParameter.ActualValue; }
178    }
179    public BoolValue ApplyLinearScaling {
180      get { return ApplyLinearScalingParameter.ActualValue; }
181      set { ApplyLinearScalingParameter.ActualValue = value; }
182    }
183    public DataTable VariableFrequencies {
184      get { return VariableFrequenciesParameter.ActualValue; }
185    }
186    public BoolValue CalculateSolutionComplexity {
187      get { return CalculateSolutionComplexityParameter.Value; }
188      set { CalculateSolutionComplexityParameter.Value = value; }
189    }
190
191    public ResultCollection Results {
192      get { return ResultsParameter.ActualValue; }
193    }
194    public DoubleValue BestValidationQuality {
195      get { return BestValidationQualityParameter.ActualValue; }
196      protected set { BestValidationQualityParameter.ActualValue = value; }
197    }
198    public SymbolicClassificationSolution BestValidationSolution {
199      get { return BestValidationSolutionParameter.ActualValue; }
200      protected set { BestValidationSolutionParameter.ActualValue = value; }
201    }
202    public DoubleValue BestSolutionAccuracyTraining {
203      get { return BestSolutionAccuracyTrainingParameter.ActualValue; }
204      protected set { BestSolutionAccuracyTrainingParameter.ActualValue = value; }
205    }
206    public DoubleValue BestSolutionAccuracyTest {
207      get { return BestSolutionAccuracyTestParameter.ActualValue; }
208      protected set { BestSolutionAccuracyTestParameter.ActualValue = value; }
209    }
210    public IntValue BestSolutionLength {
211      get { return BestSolutionLengthParameter.ActualValue; }
212      set { BestSolutionLengthParameter.ActualValue = value; }
213    }
214    public IntValue BestSolutionHeight {
215      get { return BestSolutionHeightParameter.ActualValue; }
216      set { BestSolutionHeightParameter.ActualValue = value; }
217    }
218    #endregion
219
220    [StorableConstructor]
221    protected ValidationBestSymbolicClassificationSolutionAnalyzer(bool deserializing) : base(deserializing) { }
222    protected ValidationBestSymbolicClassificationSolutionAnalyzer(ValidationBestSymbolicClassificationSolutionAnalyzer original, Cloner cloner)
223      : base(original, cloner) {
224    }
225    public ValidationBestSymbolicClassificationSolutionAnalyzer()
226      : base() {
227      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
228      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
229      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "The random generator to use."));
230      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
231      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
232      Parameters.Add(new LookupParameter<ClassificationProblemData>(ClassificationProblemDataParameterName, "The problem data for which the symbolic expression tree is a solution."));
233      Parameters.Add(new LookupParameter<ISymbolicClassificationEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
234      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesStartParameterName, "The first index of the validation partition of the data set."));
235      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesEndParameterName, "The last index of the validation partition of the data set."));
236      Parameters.Add(new ValueParameter<PercentValue>(RelativeNumberOfEvaluatedSamplesParameterName, "The relative number of samples of the dataset partition, which should be randomly chosen for evaluation between the start and end index.", new PercentValue(1)));
237      Parameters.Add(new ValueLookupParameter<DoubleValue>(UpperEstimationLimitParameterName, "The upper estimation limit that was set for the evaluation of the symbolic expression trees."));
238      Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
239      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
240      Parameters.Add(new ValueParameter<BoolValue>(CalculateSolutionComplexityParameterName, "Determines if the length and height of the validation best solution should be calculated.", new BoolValue(true)));
241      Parameters.Add(new ValueLookupParameter<BoolValue>(ApplyLinearScalingParameterName, "The switch determines if the best solution should be linearly scaled on the whole training set.", new BoolValue(false)));
242
243      Parameters.Add(new ValueLookupParameter<ResultCollection>(ResultsParameterName, "The results collection where the analysis values should be stored."));
244      Parameters.Add(new LookupParameter<DoubleValue>(BestValidationQualityParameterName, "The validation quality of the best solution in the current run."));
245      Parameters.Add(new LookupParameter<SymbolicClassificationSolution>(BestValidationSolutionParameterName, "The best solution on the validation data found in the current run."));
246      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionAccuracyTrainingParameterName, "The training accuracy of the best solution."));
247      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionAccuracyTestParameterName, "The test accuracy of the best solution."));
248      Parameters.Add(new LookupParameter<IntValue>(BestSolutionLengthParameterName, "The length of the best symbolic classification solution."));
249      Parameters.Add(new LookupParameter<IntValue>(BestSolutionHeightParameterName, "The height of the best symbolic classification solution."));
250    }
251
252    [StorableHook(HookType.AfterDeserialization)]
253    private void AfterDeserialization() {
254      if (!Parameters.ContainsKey(CalculateSolutionComplexityParameterName)) {
255        Parameters.Add(new ValueParameter<BoolValue>(CalculateSolutionComplexityParameterName, "Determines if the length and height of the validation best solution should be calculated.", new BoolValue(true)));
256      }
257      if (!Parameters.ContainsKey(BestSolutionLengthParameterName)) {
258        Parameters.Add(new LookupParameter<IntValue>(BestSolutionLengthParameterName, "The length of the best symbolic classification solution."));
259      }
260      if (!Parameters.ContainsKey(BestSolutionHeightParameterName)) {
261        Parameters.Add(new LookupParameter<IntValue>(BestSolutionHeightParameterName, "The height of the best symbolic classification solution."));
262      }
263      if (!Parameters.ContainsKey(ApplyLinearScalingParameterName)) {
264        Parameters.Add(new ValueLookupParameter<BoolValue>(ApplyLinearScalingParameterName, "The switch determines if the best solution should be linearly scaled on the whole training set.", new BoolValue(false)));
265      }
266    }
267
268    public override IDeepCloneable Clone(Cloner cloner) {
269      return new ValidationBestSymbolicClassificationSolutionAnalyzer(this, cloner);
270    }
271
272    public override IOperation Apply() {
273      var trees = SymbolicExpressionTree;
274      string targetVariable = ClassificationProblemData.TargetVariable.Value;
275
276      // select a random subset of rows in the validation set
277      int validationStart = ValidiationSamplesStart.Value;
278      int validationEnd = ValidationSamplesEnd.Value;
279      int seed = Random.Next();
280      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
281      if (count == 0) count = 1;
282      IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count)
283         .Where(row => row < ClassificationProblemData.TestSamplesStart.Value || ClassificationProblemData.TestSamplesEnd.Value <= row);
284
285      double upperEstimationLimit = UpperEstimationLimit != null ? UpperEstimationLimit.Value : double.PositiveInfinity;
286      double lowerEstimationLimit = LowerEstimationLimit != null ? LowerEstimationLimit.Value : double.NegativeInfinity;
287
288      double bestQuality = Maximization.Value ? double.NegativeInfinity : double.PositiveInfinity;
289      SymbolicExpressionTree bestTree = null;
290
291      foreach (var tree in trees) {
292        double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree,
293          lowerEstimationLimit, upperEstimationLimit, ClassificationProblemData.Dataset,
294          targetVariable, rows);
295
296        if ((Maximization.Value && quality > bestQuality) ||
297            (!Maximization.Value && quality < bestQuality)) {
298          bestQuality = quality;
299          bestTree = tree;
300        }
301      }
302
303      // if the best validation tree is better than the current best solution => update
304      bool newBest =
305        BestValidationQuality == null ||
306        (Maximization.Value && bestQuality > BestValidationQuality.Value) ||
307        (!Maximization.Value && bestQuality < BestValidationQuality.Value);
308      if (newBest) {
309        if (ApplyLinearScaling.Value) {
310          double alpha, beta;
311          SymbolicRegressionScaledMeanSquaredErrorEvaluator.Calculate(SymbolicExpressionTreeInterpreter, bestTree,
312            lowerEstimationLimit, upperEstimationLimit,
313            ClassificationProblemData.Dataset, targetVariable,
314            ClassificationProblemData.TrainingIndizes, out beta, out alpha);
315
316          // scale tree for solution
317          bestTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta);
318        }
319        var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(),
320          bestTree);
321
322        if (BestValidationSolution == null) {
323          BestValidationSolution = new SymbolicClassificationSolution(ClassificationProblemData, model, LowerEstimationLimit.Value, UpperEstimationLimit.Value);
324          BestValidationSolution.Name = BestValidationSolutionParameterName;
325          BestValidationSolution.Description = "Best solution on validation partition found over the whole run.";
326          BestValidationQuality = new DoubleValue(bestQuality);
327        } else {
328          BestValidationSolution.Model = model;
329          BestValidationQuality.Value = bestQuality;
330        }
331
332        UpdateBestSolutionResults();
333      }
334      return base.Apply();
335    }
336
337    private void UpdateBestSolutionResults() {
338      if (CalculateSolutionComplexity.Value) {
339        BestSolutionLength = new IntValue(BestValidationSolution.Model.SymbolicExpressionTree.Size);
340        BestSolutionHeight = new IntValue(BestValidationSolution.Model.SymbolicExpressionTree.Height);
341        if (!Results.ContainsKey(BestSolutionLengthParameterName)) {
342          Results.Add(new Result(BestSolutionLengthParameterName, "Length of the best solution on the validation set", new IntValue()));
343          Results.Add(new Result(BestSolutionHeightParameterName, "Height of the best solution on the validation set", new IntValue()));
344        }
345        Results[BestSolutionLengthParameterName].Value = BestSolutionLength;
346        Results[BestSolutionHeightParameterName].Value = BestSolutionHeight;
347      }
348
349      BestSymbolicRegressionSolutionAnalyzer.UpdateBestSolutionResults(BestValidationSolution, ClassificationProblemData, Results, Generations, VariableFrequencies);
350
351      IEnumerable<double> trainingValues = ClassificationProblemData.Dataset.GetEnumeratedVariableValues(
352        ClassificationProblemData.TargetVariable.Value, ClassificationProblemData.TrainingIndizes);
353      IEnumerable<double> testValues = ClassificationProblemData.Dataset.GetEnumeratedVariableValues(
354        ClassificationProblemData.TargetVariable.Value, ClassificationProblemData.TestIndizes);
355
356      OnlineAccuracyEvaluator accuracyEvaluator = new OnlineAccuracyEvaluator();
357      var originalEnumerator = trainingValues.GetEnumerator();
358      var estimatedEnumerator = BestValidationSolution.EstimatedTrainingClassValues.GetEnumerator();
359      while (originalEnumerator.MoveNext() & estimatedEnumerator.MoveNext()) {
360        accuracyEvaluator.Add(originalEnumerator.Current, estimatedEnumerator.Current);
361      }
362      double trainingAccuracy = accuracyEvaluator.Accuracy;
363
364      accuracyEvaluator.Reset();
365      originalEnumerator = testValues.GetEnumerator();
366      estimatedEnumerator = BestValidationSolution.EstimatedTestClassValues.GetEnumerator();
367      while (originalEnumerator.MoveNext() & estimatedEnumerator.MoveNext()) {
368        accuracyEvaluator.Add(originalEnumerator.Current, estimatedEnumerator.Current);
369      }
370      double testAccuracy = accuracyEvaluator.Accuracy;
371
372      if (!Results.ContainsKey(BestSolutionAccuracyTrainingParameterName)) {
373        BestSolutionAccuracyTraining = new DoubleValue(trainingAccuracy);
374        BestSolutionAccuracyTest = new DoubleValue(testAccuracy);
375
376        Results.Add(new Result(BestSolutionAccuracyTrainingParameterName, BestSolutionAccuracyTraining));
377        Results.Add(new Result(BestSolutionAccuracyTestParameterName, BestSolutionAccuracyTest));
378      } else {
379        BestSolutionAccuracyTraining.Value = trainingAccuracy;
380        BestSolutionAccuracyTest.Value = testAccuracy;
381      }
382    }
383  }
384}
Note: See TracBrowser for help on using the repository browser.