Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Classification/HeuristicLab.Problems.DataAnalysis.Classification/3.3/Symbolic/Analyzer/ValidationBestSymbolicClassificationSolutionAnalyzer.cs @ 4391

Last change on this file since 4391 was 4391, checked in by mkommend, 14 years ago

updated classification branch (ticket #939)

File size: 17.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Core;
26using HeuristicLab.Data;
27using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
28using HeuristicLab.Operators;
29using HeuristicLab.Optimization;
30using HeuristicLab.Parameters;
31using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic;
33using HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers;
34using HeuristicLab.Problems.DataAnalysis.Symbolic;
35
36namespace HeuristicLab.Problems.DataAnalysis.Classification {
37  [Item("ValidationBestSymbolicClassificationSolutionAnalyzer", "An operator that analyzes the validation best symbolic classification solution.")]
38  [StorableClass]
39  public class ValidationBestSymbolicClassificationSolutionAnalyzer : SingleSuccessorOperator, ISymbolicClassificationAnalyzer {
40    private const string MaximizationParameterName = "Maximization";
41    private const string GenerationsParameterName = "Generations";
42    private const string RandomParameterName = "Random";
43    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
44    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";
45
46    private const string ClassificationProblemDataParameterName = "ClassificationProblemData";
47    private const string EvaluatorParameterName = "Evaluator";
48    private const string ValidationSamplesStartParameterName = "SamplesStart";
49    private const string ValidationSamplesEndParameterName = "SamplesEnd";
50    private const string RelativeNumberOfEvaluatedSamplesParameterName = "RelativeNumberOfEvaluatedSamples";
51    private const string UpperEstimationLimitParameterName = "UpperEstimationLimit";
52    private const string LowerEstimationLimitParameterName = "LowerEstimationLimit";
53
54    private const string ResultsParameterName = "Results";
55    private const string BestValidationQualityParameterName = "Best validation quality";
56    private const string BestValidationSolutionParameterName = "Best validation solution";
57    private const string BestSolutionGenerationParameterName = "Best solution generation";
58    private const string BestSolutionInputvariableCountParameterName = "Variables used by best solution";
59    private const string VariableFrequenciesParameterName = "VariableFrequencies";
60    private const string VariableImpactsParameterName = "Variable Impacts";
61
62    #region parameter properties
63    public ILookupParameter<BoolValue> MaximizationParameter {
64      get { return (ILookupParameter<BoolValue>)Parameters[MaximizationParameterName]; }
65    }
66    public ILookupParameter<IntValue> GenerationsParameter {
67      get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
68    }
69    public ILookupParameter<IRandom> RandomParameter {
70      get { return (ILookupParameter<IRandom>)Parameters[RandomParameterName]; }
71    }
72    public ScopeTreeLookupParameter<SymbolicExpressionTree> SymbolicExpressionTreeParameter {
73      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
74    }
75    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
76      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
77    }
78
79    public ILookupParameter<ClassificationProblemData> ClassificationProblemDataParameter {
80      get { return (ILookupParameter<ClassificationProblemData>)Parameters[ClassificationProblemDataParameterName]; }
81    }
82    public ILookupParameter<ISymbolicClassificationEvaluator> EvaluatorParameter {
83      get { return (ILookupParameter<ISymbolicClassificationEvaluator>)Parameters[EvaluatorParameterName]; }
84    }
85    public IValueLookupParameter<IntValue> ValidationSamplesStartParameter {
86      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesStartParameterName]; }
87    }
88    public IValueLookupParameter<IntValue> ValidationSamplesEndParameter {
89      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesEndParameterName]; }
90    }
91    public IValueParameter<PercentValue> RelativeNumberOfEvaluatedSamplesParameter {
92      get { return (IValueParameter<PercentValue>)Parameters[RelativeNumberOfEvaluatedSamplesParameterName]; }
93    }
94    public IValueLookupParameter<DoubleValue> UpperEstimationLimitParameter {
95      get { return (IValueLookupParameter<DoubleValue>)Parameters[UpperEstimationLimitParameterName]; }
96    }
97    public IValueLookupParameter<DoubleValue> LowerEstimationLimitParameter {
98      get { return (IValueLookupParameter<DoubleValue>)Parameters[LowerEstimationLimitParameterName]; }
99    }
100    public ILookupParameter<DataTable> VariableFrequenciesParameter {
101      get { return (ILookupParameter<DataTable>)Parameters[VariableFrequenciesParameterName]; }
102    }
103
104    public ILookupParameter<ResultCollection> ResultsParameter {
105      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
106    }
107    public ILookupParameter<DoubleValue> BestValidationQualityParameter {
108      get { return (ILookupParameter<DoubleValue>)Parameters[BestValidationQualityParameterName]; }
109    }
110    public ILookupParameter<SymbolicClassificationSolution> BestValidationSolutionParameter {
111      get { return (ILookupParameter<SymbolicClassificationSolution>)Parameters[BestValidationSolutionParameterName]; }
112    }
113    public ILookupParameter<IntValue> BestSolutionGenerationParameter {
114      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionGenerationParameterName]; }
115    }
116    public ILookupParameter<DoubleMatrix> VariableImpactsParameter {
117      get { return (ILookupParameter<DoubleMatrix>)Parameters[VariableImpactsParameterName]; }
118    }
119    public ILookupParameter<IntValue> BestSolutionInputvariableCountParameter {
120      get { return (ILookupParameter<IntValue>)Parameters[BestSolutionInputvariableCountParameterName]; }
121    }
122    #endregion
123    #region properties
124    public BoolValue Maximization {
125      get { return MaximizationParameter.ActualValue; }
126    }
127    public IntValue Generations {
128      get { return GenerationsParameter.ActualValue; }
129    }
130    public IRandom Random {
131      get { return RandomParameter.ActualValue; }
132    }
133    public ItemArray<SymbolicExpressionTree> SymbolicExpressionTree {
134      get { return SymbolicExpressionTreeParameter.ActualValue; }
135    }
136    public ISymbolicExpressionTreeInterpreter SymbolicExpressionTreeInterpreter {
137      get { return SymbolicExpressionTreeInterpreterParameter.ActualValue; }
138    }
139
140    public ClassificationProblemData ClassificationProblemData {
141      get { return ClassificationProblemDataParameter.ActualValue; }
142    }
143    public ISymbolicClassificationEvaluator Evaluator {
144      get { return EvaluatorParameter.ActualValue; }
145    }
146    public IntValue ValidiationSamplesStart {
147      get { return ValidationSamplesStartParameter.ActualValue; }
148    }
149    public IntValue ValidationSamplesEnd {
150      get { return ValidationSamplesEndParameter.ActualValue; }
151    }
152    public PercentValue RelativeNumberOfEvaluatedSamples {
153      get { return RelativeNumberOfEvaluatedSamplesParameter.Value; }
154    }
155    public DoubleValue UpperEstimationLimit {
156      get { return UpperEstimationLimitParameter.ActualValue; }
157    }
158    public DoubleValue LowerEstimationLimit {
159      get { return LowerEstimationLimitParameter.ActualValue; }
160    }
161    public DataTable VariableFrequencies {
162      get { return VariableFrequenciesParameter.ActualValue; }
163    }
164
165    public ResultCollection Results {
166      get { return ResultsParameter.ActualValue; }
167    }
168    public DoubleValue BestValidationQuality {
169      get { return BestValidationQualityParameter.ActualValue; }
170      protected set { BestValidationQualityParameter.ActualValue = value; }
171    }
172    public SymbolicClassificationSolution BestValidationSolution {
173      get { return BestValidationSolutionParameter.ActualValue; }
174      protected set { BestValidationSolutionParameter.ActualValue = value; }
175    }
176    public IntValue BestSolutionGeneration {
177      get { return BestSolutionGenerationParameter.ActualValue; }
178      protected set { BestSolutionGenerationParameter.ActualValue = value; }
179    }
180    public IntValue BestSolutionInputvariableCount {
181      get { return BestSolutionInputvariableCountParameter.ActualValue; }
182      protected set { BestSolutionInputvariableCountParameter.ActualValue = value; }
183    }
184    public DoubleMatrix VariableImpacts {
185      get { return VariableImpactsParameter.ActualValue; }
186      protected set { VariableImpactsParameter.ActualValue = value; }
187    }
188    #endregion
189
190    public ValidationBestSymbolicClassificationSolutionAnalyzer()
191      : base() {
192      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
193      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
194      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "The random generator to use."));
195      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
196      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
197
198      Parameters.Add(new LookupParameter<ClassificationProblemData>(ClassificationProblemDataParameterName, "The problem data for which the symbolic expression tree is a solution."));
199      Parameters.Add(new LookupParameter<ISymbolicClassificationEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
200      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesStartParameterName, "The first index of the validation partition of the data set."));
201      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesEndParameterName, "The last index of the validation partition of the data set."));
202      Parameters.Add(new ValueParameter<PercentValue>(RelativeNumberOfEvaluatedSamplesParameterName, "The relative number of samples of the dataset partition, which should be randomly chosen for evaluation between the start and end index.", new PercentValue(1)));
203      Parameters.Add(new ValueLookupParameter<DoubleValue>(UpperEstimationLimitParameterName, "The upper estimation limit that was set for the evaluation of the symbolic expression trees."));
204      Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
205      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
206
207      Parameters.Add(new ValueLookupParameter<ResultCollection>(ResultsParameterName, "The results collection where the analysis values should be stored."));
208      Parameters.Add(new LookupParameter<DoubleValue>(BestValidationQualityParameterName, "The validation quality of the best solution in the current run."));
209      Parameters.Add(new LookupParameter<SymbolicClassificationSolution>(BestValidationSolutionParameterName, "The best solution on the validation data found in the current run."));
210      Parameters.Add(new LookupParameter<IntValue>(BestSolutionGenerationParameterName, "The generation in which the best solution was found."));
211      Parameters.Add(new LookupParameter<DoubleMatrix>(VariableImpactsParameterName, "The impacts of the input variables calculated during the run."));
212      Parameters.Add(new LookupParameter<IntValue>(BestSolutionInputvariableCountParameterName, "The number of input variables used by the best solution."));
213
214    }
215
216    [StorableConstructor]
217    private ValidationBestSymbolicClassificationSolutionAnalyzer(bool deserializing) : base(deserializing) { }
218
219    public override IOperation Apply() {
220      var trees = SymbolicExpressionTree;
221      string targetVariable = ClassificationProblemData.TargetVariable.Value;
222
223      // select a random subset of rows in the validation set
224      int validationStart = ValidiationSamplesStart.Value;
225      int validationEnd = ValidationSamplesEnd.Value;
226      int seed = Random.Next();
227      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
228      if (count == 0) count = 1;
229      IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count);
230
231      double upperEstimationLimit = UpperEstimationLimit != null ? UpperEstimationLimit.Value : double.PositiveInfinity;
232      double lowerEstimationLimit = LowerEstimationLimit != null ? LowerEstimationLimit.Value : double.NegativeInfinity;
233
234      double bestQuality = Maximization.Value ? double.NegativeInfinity : double.PositiveInfinity;
235      SymbolicExpressionTree bestTree = null;
236
237      foreach (var tree in trees) {
238        double quality = Evaluator.Evaluate(SymbolicExpressionTreeInterpreter, tree,
239          lowerEstimationLimit, upperEstimationLimit, ClassificationProblemData.Dataset,
240          targetVariable, rows);
241
242        if ((Maximization.Value && quality > bestQuality) ||
243            (!Maximization.Value && quality < bestQuality)) {
244          bestQuality = quality;
245          bestTree = tree;
246        }
247      }
248
249      // if the best validation tree is better than the current best solution => update
250      bool newBest =
251        BestValidationQuality == null ||
252        (Maximization.Value && bestQuality > BestValidationQuality.Value) ||
253        (!Maximization.Value && bestQuality < BestValidationQuality.Value);
254      if (newBest) {
255        double alpha, beta;
256        int trainingStart = ClassificationProblemData.TrainingSamplesStart.Value;
257        int trainingEnd = ClassificationProblemData.TrainingSamplesEnd.Value;
258        IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart);
259        SymbolicRegressionScaledMeanSquaredErrorEvaluator.Calculate(SymbolicExpressionTreeInterpreter, bestTree,
260          lowerEstimationLimit, upperEstimationLimit,
261          ClassificationProblemData.Dataset, targetVariable,
262          trainingRows, out beta, out alpha);
263
264        // scale tree for solution
265        var scaledTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta);
266        var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(),
267          scaledTree);
268
269        if (BestValidationSolution == null) {
270          BestValidationSolution = new SymbolicClassificationSolution(ClassificationProblemData, model, LowerEstimationLimit.Value, UpperEstimationLimit.Value);
271          BestValidationSolution.Name = BestValidationSolutionParameterName;
272          BestValidationSolution.Description = "Best solution on validation partition found over the whole run.";
273          BestValidationQuality = new DoubleValue(bestQuality);
274          BestSolutionGeneration = (IntValue)Generations.Clone();
275          BestSolutionInputvariableCount = new IntValue(BestValidationSolution.Model.InputVariables.Count());
276
277          Results.Add(new Result(BestValidationSolutionParameterName, BestValidationSolution));
278          Results.Add(new Result(BestValidationQualityParameterName, BestValidationQuality));
279          Results.Add(new Result(BestSolutionGenerationParameterName, BestSolutionGeneration));
280
281          Results.Add(new Result(BestSolutionInputvariableCountParameterName, BestSolutionInputvariableCount));
282
283          if (VariableFrequencies != null) {
284            VariableImpacts = CalculateVariableImpacts(VariableFrequencies);
285            Results.Add(new Result(VariableImpactsParameterName, VariableImpacts));
286          }
287
288        } else {
289          BestValidationSolution.Model = model;
290          BestValidationQuality.Value = bestQuality;
291          BestSolutionGeneration.Value = Generations.Value;
292          BestSolutionInputvariableCount.Value = BestValidationSolution.Model.InputVariables.Count();
293
294          if (VariableFrequencies != null) {
295            VariableImpacts = CalculateVariableImpacts(VariableFrequencies);
296            Results[VariableImpactsParameterName].Value = VariableImpacts;
297          }
298        }
299      }
300      return base.Apply();
301    }
302
303    private static DoubleMatrix CalculateVariableImpacts(DataTable variableFrequencies) {
304      if (variableFrequencies != null) {
305        var impacts = new DoubleMatrix(variableFrequencies.Rows.Count, 1, new string[] { "Impact" }, variableFrequencies.Rows.Select(x => x.Name));
306        impacts.SortableView = true;
307        int rowIndex = 0;
308        foreach (var dataRow in variableFrequencies.Rows) {
309          string variableName = dataRow.Name;
310          impacts[rowIndex++, 0] = dataRow.Values.Average();
311        }
312        return impacts;
313      } else return new DoubleMatrix(1, 1);
314    }
315  }
316}
Note: See TracBrowser for help on using the repository browser.