Free cookie consent management tool by TermsFeed Policy Generator

source: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs @ 4255

Last change on this file since 4255 was 4255, checked in by gkronber, 14 years ago

Added complexity reduction scheme based on validation performance for CPP. #1142

File size: 21.1 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Core;
26using HeuristicLab.Data;
27using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
28using HeuristicLab.Operators;
29using HeuristicLab.Optimization;
30using HeuristicLab.Parameters;
31using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32using HeuristicLab.Problems.DataAnalysis.Evaluators;
33using HeuristicLab.Problems.DataAnalysis.Symbolic;
34using System;
35
namespace HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers {
  /// <summary>
  /// An operator that analyzes the validation best scaled symbolic regression solution.
  /// </summary>
  /// <remarks>
  /// Each application evaluates all symbolic expression trees on a random sample of rows taken
  /// between the validation start and end index. If the best tree of this application improves on
  /// the stored best validation quality, the tree is scaled (scaling parameters are calculated on
  /// the training partition) and stored as the new best solution. The best and the current
  /// validation quality are written to the result collection and appended to a data table.
  /// </remarks>
  [Item("FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer", "An operator that analyzes the validation best scaled symbolic regression solution.")]
  [StorableClass]
  public sealed class FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer : SingleSuccessorOperator, ISymbolicRegressionAnalyzer {
    private const string RandomParameterName = "Random";
    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";
    private const string ProblemDataParameterName = "ProblemData";
    private const string ValidationSamplesStartParameterName = "SamplesStart";
    private const string ValidationSamplesEndParameterName = "SamplesEnd";
    private const string UpperEstimationLimitParameterName = "UpperEstimationLimit";
    private const string LowerEstimationLimitParameterName = "LowerEstimationLimit";
    private const string EvaluatorParameterName = "Evaluator";
    private const string MaximizationParameterName = "Maximization";
    private const string BestSolutionParameterName = "Best solution (validation)";
    private const string BestSolutionQualityParameterName = "Best solution quality (validation)";
    private const string CurrentBestValidationQualityParameterName = "Current best validation quality";
    private const string BestSolutionQualityValuesParameterName = "Validation Quality";
    private const string ResultsParameterName = "Results";
    private const string VariableFrequenciesParameterName = "VariableFrequencies";
    private const string BestKnownQualityParameterName = "BestKnownQuality";
    private const string GenerationsParameterName = "Generations";
    private const string RelativeNumberOfEvaluatedSamplesParameterName = "RelativeNumberOfEvaluatedSamples";

    private const string TrainingMeanSquaredErrorQualityParameterName = "Mean squared error (training)";
    private const string MinTrainingMeanSquaredErrorQualityParameterName = "Min mean squared error (training)";
    private const string MaxTrainingMeanSquaredErrorQualityParameterName = "Max mean squared error (training)";
    private const string AverageTrainingMeanSquaredErrorQualityParameterName = "Average mean squared error (training)";
    private const string BestTrainingMeanSquaredErrorQualityParameterName = "Best mean squared error (training)";

    private const string TrainingAverageRelativeErrorQualityParameterName = "Average relative error (training)";
    private const string MinTrainingAverageRelativeErrorQualityParameterName = "Min average relative error (training)";
    private const string MaxTrainingAverageRelativeErrorQualityParameterName = "Max average relative error (training)";
    private const string AverageTrainingAverageRelativeErrorQualityParameterName = "Average average relative error (training)";
    private const string BestTrainingAverageRelativeErrorQualityParameterName = "Best average relative error (training)";

    private const string TrainingRSquaredQualityParameterName = "R² (training)";
    private const string MinTrainingRSquaredQualityParameterName = "Min R² (training)";
    private const string MaxTrainingRSquaredQualityParameterName = "Max R² (training)";
    private const string AverageTrainingRSquaredQualityParameterName = "Average R² (training)";
    private const string BestTrainingRSquaredQualityParameterName = "Best R² (training)";

    private const string TestMeanSquaredErrorQualityParameterName = "Mean squared error (test)";
    private const string MinTestMeanSquaredErrorQualityParameterName = "Min mean squared error (test)";
    private const string MaxTestMeanSquaredErrorQualityParameterName = "Max mean squared error (test)";
    private const string AverageTestMeanSquaredErrorQualityParameterName = "Average mean squared error (test)";
    private const string BestTestMeanSquaredErrorQualityParameterName = "Best mean squared error (test)";

    private const string TestAverageRelativeErrorQualityParameterName = "Average relative error (test)";
    private const string MinTestAverageRelativeErrorQualityParameterName = "Min average relative error (test)";
    private const string MaxTestAverageRelativeErrorQualityParameterName = "Max average relative error (test)";
    private const string AverageTestAverageRelativeErrorQualityParameterName = "Average average relative error (test)";
    private const string BestTestAverageRelativeErrorQualityParameterName = "Best average relative error (test)";

    private const string TestRSquaredQualityParameterName = "R² (test)";
    private const string MinTestRSquaredQualityParameterName = "Min R² (test)";
    private const string MaxTestRSquaredQualityParameterName = "Max R² (test)";
    private const string AverageTestRSquaredQualityParameterName = "Average R² (test)";
    private const string BestTestRSquaredQualityParameterName = "Best R² (test)";

    private const string RSquaredValuesParameterName = "R²";
    private const string MeanSquaredErrorValuesParameterName = "Mean squared error";
    private const string RelativeErrorValuesParameterName = "Average relative error";

    #region parameter properties
    public ILookupParameter<IRandom> RandomParameter {
      get { return (ILookupParameter<IRandom>)Parameters[RandomParameterName]; }
    }
    public ScopeTreeLookupParameter<SymbolicExpressionTree> SymbolicExpressionTreeParameter {
      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
    }
    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
    }
    public ILookupParameter<ISymbolicRegressionEvaluator> EvaluatorParameter {
      get { return (ILookupParameter<ISymbolicRegressionEvaluator>)Parameters[EvaluatorParameterName]; }
    }
    public ILookupParameter<BoolValue> MaximizationParameter {
      get { return (ILookupParameter<BoolValue>)Parameters[MaximizationParameterName]; }
    }
    public IValueLookupParameter<DataAnalysisProblemData> ProblemDataParameter {
      get { return (IValueLookupParameter<DataAnalysisProblemData>)Parameters[ProblemDataParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesStartParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesStartParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesEndParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesEndParameterName]; }
    }
    public IValueParameter<PercentValue> RelativeNumberOfEvaluatedSamplesParameter {
      get { return (IValueParameter<PercentValue>)Parameters[RelativeNumberOfEvaluatedSamplesParameterName]; }
    }

    public IValueLookupParameter<DoubleValue> UpperEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[UpperEstimationLimitParameterName]; }
    }
    public IValueLookupParameter<DoubleValue> LowerEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[LowerEstimationLimitParameterName]; }
    }
    public ILookupParameter<SymbolicRegressionSolution> BestSolutionParameter {
      get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters[BestSolutionParameterName]; }
    }
    public ILookupParameter<IntValue> GenerationsParameter {
      get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestSolutionQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionQualityParameterName]; }
    }
    public ILookupParameter<DataTable> BestSolutionQualityValuesParameter {
      get { return (ILookupParameter<DataTable>)Parameters[BestSolutionQualityValuesParameterName]; }
    }
    public ILookupParameter<ResultCollection> ResultsParameter {
      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestKnownQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestKnownQualityParameterName]; }
    }
    public ILookupParameter<DoubleValue> CurrentBestValidationQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[CurrentBestValidationQualityParameterName]; }
    }

    public ILookupParameter<DataTable> VariableFrequenciesParameter {
      get { return (ILookupParameter<DataTable>)Parameters[VariableFrequenciesParameterName]; }
    }

    #endregion
    #region properties
    public IRandom Random {
      get { return RandomParameter.ActualValue; }
    }
    public ItemArray<SymbolicExpressionTree> SymbolicExpressionTree {
      get { return SymbolicExpressionTreeParameter.ActualValue; }
    }
    public ISymbolicExpressionTreeInterpreter SymbolicExpressionTreeInterpreter {
      get { return SymbolicExpressionTreeInterpreterParameter.ActualValue; }
    }
    public ISymbolicRegressionEvaluator Evaluator {
      get { return EvaluatorParameter.ActualValue; }
    }
    public BoolValue Maximization {
      get { return MaximizationParameter.ActualValue; }
    }
    public DataAnalysisProblemData ProblemData {
      get { return ProblemDataParameter.ActualValue; }
    }
    // NOTE: the property name is misspelled ("Validiation"); it is public and therefore kept
    // unchanged so that existing callers continue to compile.
    public IntValue ValidiationSamplesStart {
      get { return ValidationSamplesStartParameter.ActualValue; }
    }
    public IntValue ValidationSamplesEnd {
      get { return ValidationSamplesEndParameter.ActualValue; }
    }
    public PercentValue RelativeNumberOfEvaluatedSamples {
      get { return RelativeNumberOfEvaluatedSamplesParameter.Value; }
    }

    public DoubleValue UpperEstimationLimit {
      get { return UpperEstimationLimitParameter.ActualValue; }
    }
    public DoubleValue LowerEstimationLimit {
      get { return LowerEstimationLimitParameter.ActualValue; }
    }
    public ResultCollection Results {
      get { return ResultsParameter.ActualValue; }
    }
    public DataTable VariableFrequencies {
      get { return VariableFrequenciesParameter.ActualValue; }
    }
    public IntValue Generations {
      get { return GenerationsParameter.ActualValue; }
    }
    public DoubleValue BestSolutionQuality {
      get { return BestSolutionQualityParameter.ActualValue; }
    }

    #endregion

    /// <summary>
    /// Creates the analyzer and registers all of its parameters.
    /// </summary>
    public FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer()
      : base() {
      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "The random generator to use."));
      Parameters.Add(new LookupParameter<ISymbolicRegressionEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
      Parameters.Add(new ValueLookupParameter<DataAnalysisProblemData>(ProblemDataParameterName, "The problem data for which the symbolic expression tree is a solution."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesStartParameterName, "The first index of the validation partition of the data set."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesEndParameterName, "The last index of the validation partition of the data set."));
      Parameters.Add(new ValueParameter<PercentValue>(RelativeNumberOfEvaluatedSamplesParameterName, "The relative number of samples of the dataset partition, which should be randomly chosen for evaluation between the start and end index.", new PercentValue(1)));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(UpperEstimationLimitParameterName, "The upper estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new LookupParameter<SymbolicRegressionSolution>(BestSolutionParameterName, "The best symbolic regression solution."));
      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionQualityParameterName, "The quality of the best symbolic regression solution."));
      Parameters.Add(new LookupParameter<ResultCollection>(ResultsParameterName, "The result collection where the best symbolic regression solution should be stored."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestKnownQualityParameterName, "The best known (validation) quality achieved on the data set."));
      Parameters.Add(new LookupParameter<DoubleValue>(CurrentBestValidationQualityParameterName, "The quality of the best solution (on the validation set) of the current generation."));
      Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName));
      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
    }

    [StorableConstructor]
    private FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer(bool deserializing) : base() { }

    /// <summary>
    /// Adds parameters that are missing in instances persisted by older versions of this class.
    /// </summary>
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      #region compatibility remove before releasing 3.3.1
      if (!Parameters.ContainsKey(EvaluatorParameterName)) {
        Parameters.Add(new LookupParameter<ISymbolicRegressionEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
      }
      if (!Parameters.ContainsKey(MaximizationParameterName)) {
        Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
      }
      if (!Parameters.ContainsKey(BestSolutionQualityValuesParameterName)) {
        Parameters.Add(new LookupParameter<DataTable>(BestSolutionQualityValuesParameterName));
      }
      #endregion
    }

    /// <summary>
    /// Evaluates all trees on a random sample of the validation rows, updates the best validation
    /// solution if the best tree of this application is better than the stored one, and reports
    /// the best and the current validation quality.
    /// </summary>
    /// <remarks>
    /// If no tree yields a comparable quality (no trees at all, or every quality is NaN) and no
    /// best solution has been stored yet, only the current validation quality variable is written
    /// (with the worst possible value for the optimization direction) and result reporting is skipped.
    /// </remarks>
    /// <returns>The operation returned by the base implementation.</returns>
    public override IOperation Apply() {
      // resolve every looked-up value once instead of once per tree / per condition
      ItemArray<SymbolicExpressionTree> trees = SymbolicExpressionTree;
      DataAnalysisProblemData problemData = ProblemData;
      ISymbolicExpressionTreeInterpreter interpreter = SymbolicExpressionTreeInterpreter;
      ISymbolicRegressionEvaluator evaluator = Evaluator;
      bool maximization = Maximization.Value;
      string targetVariable = problemData.TargetVariable.Value;

      // select a random subset of rows in the validation set
      int validationStart = ValidiationSamplesStart.Value;
      int validationEnd = ValidationSamplesEnd.Value;
      int seed = Random.Next();
      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
      if (count == 0) count = 1;
      IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count);

      // missing estimation limits mean "unbounded"
      DoubleValue upperLimit = UpperEstimationLimit;
      DoubleValue lowerLimit = LowerEstimationLimit;
      double upperEstimationLimit = upperLimit != null ? upperLimit.Value : double.PositiveInfinity;
      double lowerEstimationLimit = lowerLimit != null ? lowerLimit.Value : double.NegativeInfinity;

      // start with the worst possible quality so that any comparable quality replaces it
      double bestQuality = maximization ? double.NegativeInfinity : double.PositiveInfinity;
      SymbolicExpressionTree bestTree = null;

      foreach (var tree in trees) {
        double quality = evaluator.Evaluate(interpreter, tree,
          lowerEstimationLimit, upperEstimationLimit,
          problemData.Dataset, targetVariable,
          rows);

        // a NaN quality never compares as better and is therefore ignored
        if (IsBetter(quality, bestQuality, maximization)) {
          bestQuality = quality;
          bestTree = tree;
        }
      }

      DoubleValue previousBestQuality = BestSolutionQuality;
      if (bestTree == null && previousBestQuality == null) {
        // There is neither a candidate tree nor an earlier best solution: building a solution
        // or reporting the best solution quality would dereference null.
        CurrentBestValidationQualityParameter.ActualValue = new DoubleValue(bestQuality);
        return base.Apply();
      }

      // if the best validation tree is better than the current best solution => update
      bool newBest = bestTree != null &&
        (previousBestQuality == null || IsBetter(bestQuality, previousBestQuality.Value, maximization));
      if (newBest) {
        var solution = CreateScaledSolution(bestTree, problemData, interpreter, targetVariable, lowerEstimationLimit, upperEstimationLimit);

        BestSolutionParameter.ActualValue = solution;
        BestSolutionQualityParameter.ActualValue = new DoubleValue(bestQuality);

        BestSymbolicRegressionSolutionAnalyzer.UpdateBestSolutionResults(solution, problemData, Results, Generations, VariableFrequencies);
      }

      CurrentBestValidationQualityParameter.ActualValue = new DoubleValue(bestQuality);

      ResultCollection results = Results;
      if (!results.ContainsKey(BestSolutionQualityValuesParameterName)) {
        results.Add(new Result(BestSolutionQualityValuesParameterName, new DataTable(BestSolutionQualityValuesParameterName, BestSolutionQualityValuesParameterName)));
        results.Add(new Result(BestSolutionQualityParameterName, new DoubleValue()));
        results.Add(new Result(CurrentBestValidationQualityParameterName, new DoubleValue()));
      }

      // read after the possible update above so that a new best quality is reported immediately
      double bestSolutionQuality = BestSolutionQualityParameter.ActualValue.Value;
      results[BestSolutionQualityParameterName].Value = new DoubleValue(bestSolutionQuality);
      results[CurrentBestValidationQualityParameterName].Value = new DoubleValue(bestQuality);

      DataTable validationValues = (DataTable)results[BestSolutionQualityValuesParameterName].Value;
      AddValue(validationValues, bestSolutionQuality, BestSolutionQualityParameterName, BestSolutionQualityParameterName);
      AddValue(validationValues, bestQuality, CurrentBestValidationQualityParameterName, CurrentBestValidationQualityParameterName);

      BestSolutionQualityValuesParameter.ActualValue = validationValues;

      return base.Apply();
    }

    /// <summary>
    /// Compares two qualities with respect to the optimization direction.
    /// </summary>
    /// <param name="quality">The quality to test.</param>
    /// <param name="reference">The quality to compare against.</param>
    /// <param name="maximization">True if larger qualities are better, false if smaller qualities are better.</param>
    /// <returns>True if <paramref name="quality"/> is strictly better than <paramref name="reference"/>; always false if either value is NaN.</returns>
    private static bool IsBetter(double quality, double reference, bool maximization) {
      return maximization ? quality > reference : quality < reference;
    }

    /// <summary>
    /// Scales the given tree with scaling parameters calculated on the training partition and
    /// wraps it into a named solution.
    /// </summary>
    /// <param name="tree">The tree to scale; must not be null.</param>
    /// <param name="problemData">The problem data supplying the data set and the training partition.</param>
    /// <param name="interpreter">The interpreter used to calculate the estimated values; a clone of it is stored in the model.</param>
    /// <param name="targetVariable">The name of the target variable.</param>
    /// <param name="lowerEstimationLimit">The lower estimation limit passed on to the solution.</param>
    /// <param name="upperEstimationLimit">The upper estimation limit passed on to the solution.</param>
    /// <returns>The solution containing the scaled tree.</returns>
    private static SymbolicRegressionSolution CreateScaledSolution(SymbolicExpressionTree tree, DataAnalysisProblemData problemData,
      ISymbolicExpressionTreeInterpreter interpreter, string targetVariable, double lowerEstimationLimit, double upperEstimationLimit) {
      // calculate scaling parameters only for the best tree using the full training set
      int trainingStart = problemData.TrainingSamplesStart.Value;
      int trainingEnd = problemData.TrainingSamplesEnd.Value;
      IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart);
      IEnumerable<double> originalValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariable, trainingRows);
      IEnumerable<double> estimatedValues = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, trainingRows);

      // note the argument order: beta is returned before alpha
      double alpha, beta;
      SymbolicRegressionScaledMeanSquaredErrorEvaluator.CalculateScalingParameters(originalValues, estimatedValues, out beta, out alpha);

      // scale tree for solution
      var scaledTree = SymbolicRegressionSolutionLinearScaler.Scale(tree, alpha, beta);
      var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)interpreter.Clone(), scaledTree);
      var solution = new SymbolicRegressionSolution(problemData, model, lowerEstimationLimit, upperEstimationLimit);
      solution.Name = BestSolutionParameterName;
      solution.Description = "Best solution on validation partition found over the whole run.";
      return solution;
    }

    /// <summary>
    /// Appends a value to the row with the given name, creating the row if the table does not contain it yet.
    /// </summary>
    /// <param name="table">The table to extend.</param>
    /// <param name="data">The value to append.</param>
    /// <param name="name">The name of the row.</param>
    /// <param name="description">The description used if the row has to be created.</param>
    private static void AddValue(DataTable table, double data, string name, string description) {
      DataRow row;
      if (!table.Rows.TryGetValue(name, out row) || row == null) {
        row = new DataRow(name, description);
        table.Rows.Add(row);
      }
      row.Values.Add(data);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.