Free cookie consent management tool by TermsFeed Policy Generator

source: branches/CloningRefactoring/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs @ 4656

Last change on this file since 4656 was 4468, checked in by mkommend, 14 years ago

Preparation for cross validation - removed the test samples from the training samples and added the ValidationPercentage parameter (ticket #1199).

File size: 16.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System.Collections.Generic;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Core;
26using HeuristicLab.Data;
27using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
28using HeuristicLab.Operators;
29using HeuristicLab.Optimization;
30using HeuristicLab.Parameters;
31using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32using HeuristicLab.Problems.DataAnalysis.Symbolic;
33
namespace HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers {
  /// <summary>
  /// An operator that analyzes the validation best scaled symbolic regression solution.
  /// </summary>
  /// <remarks>
  /// On every application all symbolic expression trees found in the scope tree are evaluated
  /// on a random sample of the validation partition (rows that fall into the test partition are
  /// excluded). If the best of these trees beats the best validation quality stored so far, the
  /// tree is linearly scaled using the training rows and stored as the new best solution.
  /// The best-so-far and the current best validation quality are written to the result
  /// collection and appended to the "Validation Quality" data table.
  /// </remarks>
  [Item("FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer", "An operator that analyzes the validation best scaled symbolic regression solution.")]
  [StorableClass]
  public sealed class FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer : SingleSuccessorOperator, ISymbolicRegressionAnalyzer {
    // Names under which the parameters are registered in the Parameters collection.
    private const string RandomParameterName = "Random";
    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";
    private const string ProblemDataParameterName = "ProblemData";
    private const string ValidationSamplesStartParameterName = "SamplesStart";
    private const string ValidationSamplesEndParameterName = "SamplesEnd";
    private const string UpperEstimationLimitParameterName = "UpperEstimationLimit";
    private const string LowerEstimationLimitParameterName = "LowerEstimationLimit";
    private const string EvaluatorParameterName = "Evaluator";
    private const string MaximizationParameterName = "Maximization";
    private const string BestSolutionParameterName = "Best solution (validation)";
    private const string BestSolutionQualityParameterName = "Best solution quality (validation)";
    private const string CurrentBestValidationQualityParameterName = "Current best validation quality";
    private const string BestSolutionQualityValuesParameterName = "Validation Quality";
    private const string ResultsParameterName = "Results";
    private const string VariableFrequenciesParameterName = "VariableFrequencies";
    private const string BestKnownQualityParameterName = "BestKnownQuality";
    private const string GenerationsParameterName = "Generations";
    private const string RelativeNumberOfEvaluatedSamplesParameterName = "RelativeNumberOfEvaluatedSamples";

    #region parameter properties
    public ILookupParameter<IRandom> RandomParameter {
      get { return (ILookupParameter<IRandom>)Parameters[RandomParameterName]; }
    }
    public ScopeTreeLookupParameter<SymbolicExpressionTree> SymbolicExpressionTreeParameter {
      get { return (ScopeTreeLookupParameter<SymbolicExpressionTree>)Parameters[SymbolicExpressionTreeParameterName]; }
    }
    public IValueLookupParameter<ISymbolicExpressionTreeInterpreter> SymbolicExpressionTreeInterpreterParameter {
      get { return (IValueLookupParameter<ISymbolicExpressionTreeInterpreter>)Parameters[SymbolicExpressionTreeInterpreterParameterName]; }
    }
    public ILookupParameter<ISymbolicRegressionEvaluator> EvaluatorParameter {
      get { return (ILookupParameter<ISymbolicRegressionEvaluator>)Parameters[EvaluatorParameterName]; }
    }
    public ILookupParameter<BoolValue> MaximizationParameter {
      get { return (ILookupParameter<BoolValue>)Parameters[MaximizationParameterName]; }
    }
    public IValueLookupParameter<DataAnalysisProblemData> ProblemDataParameter {
      get { return (IValueLookupParameter<DataAnalysisProblemData>)Parameters[ProblemDataParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesStartParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesStartParameterName]; }
    }
    public IValueLookupParameter<IntValue> ValidationSamplesEndParameter {
      get { return (IValueLookupParameter<IntValue>)Parameters[ValidationSamplesEndParameterName]; }
    }
    public IValueParameter<PercentValue> RelativeNumberOfEvaluatedSamplesParameter {
      get { return (IValueParameter<PercentValue>)Parameters[RelativeNumberOfEvaluatedSamplesParameterName]; }
    }

    public IValueLookupParameter<DoubleValue> UpperEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[UpperEstimationLimitParameterName]; }
    }
    public IValueLookupParameter<DoubleValue> LowerEstimationLimitParameter {
      get { return (IValueLookupParameter<DoubleValue>)Parameters[LowerEstimationLimitParameterName]; }
    }
    public ILookupParameter<SymbolicRegressionSolution> BestSolutionParameter {
      get { return (ILookupParameter<SymbolicRegressionSolution>)Parameters[BestSolutionParameterName]; }
    }
    public ILookupParameter<IntValue> GenerationsParameter {
      get { return (ILookupParameter<IntValue>)Parameters[GenerationsParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestSolutionQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestSolutionQualityParameterName]; }
    }
    public ILookupParameter<ResultCollection> ResultsParameter {
      get { return (ILookupParameter<ResultCollection>)Parameters[ResultsParameterName]; }
    }
    public ILookupParameter<DoubleValue> BestKnownQualityParameter {
      get { return (ILookupParameter<DoubleValue>)Parameters[BestKnownQualityParameterName]; }
    }
    public ILookupParameter<DataTable> VariableFrequenciesParameter {
      get { return (ILookupParameter<DataTable>)Parameters[VariableFrequenciesParameterName]; }
    }

    #endregion
    #region properties
    public IRandom Random {
      get { return RandomParameter.ActualValue; }
    }
    public ItemArray<SymbolicExpressionTree> SymbolicExpressionTree {
      get { return SymbolicExpressionTreeParameter.ActualValue; }
    }
    public ISymbolicExpressionTreeInterpreter SymbolicExpressionTreeInterpreter {
      get { return SymbolicExpressionTreeInterpreterParameter.ActualValue; }
    }
    public ISymbolicRegressionEvaluator Evaluator {
      get { return EvaluatorParameter.ActualValue; }
    }
    public BoolValue Maximization {
      get { return MaximizationParameter.ActualValue; }
    }
    public DataAnalysisProblemData ProblemData {
      get { return ProblemDataParameter.ActualValue; }
    }
    // The misspelled name ("Validiation") is kept because it is part of the public interface.
    public IntValue ValidiationSamplesStart {
      get { return ValidationSamplesStartParameter.ActualValue; }
    }
    public IntValue ValidationSamplesEnd {
      get { return ValidationSamplesEndParameter.ActualValue; }
    }
    public PercentValue RelativeNumberOfEvaluatedSamples {
      get { return RelativeNumberOfEvaluatedSamplesParameter.Value; }
    }

    public DoubleValue UpperEstimationLimit {
      get { return UpperEstimationLimitParameter.ActualValue; }
    }
    public DoubleValue LowerEstimationLimit {
      get { return LowerEstimationLimitParameter.ActualValue; }
    }
    public ResultCollection Results {
      get { return ResultsParameter.ActualValue; }
    }
    public DataTable VariableFrequencies {
      get { return VariableFrequenciesParameter.ActualValue; }
    }
    public IntValue Generations {
      get { return GenerationsParameter.ActualValue; }
    }
    public DoubleValue BestSolutionQuality {
      get { return BestSolutionQualityParameter.ActualValue; }
    }

    #endregion

    /// <summary>
    /// Creates the analyzer and registers all of its parameters.
    /// The relative number of evaluated samples defaults to <c>new PercentValue(1)</c>.
    /// </summary>
    public FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer()
      : base() {
      Parameters.Add(new LookupParameter<IRandom>(RandomParameterName, "The random generator to use."));
      Parameters.Add(new LookupParameter<ISymbolicRegressionEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
      Parameters.Add(new ScopeTreeLookupParameter<SymbolicExpressionTree>(SymbolicExpressionTreeParameterName, "The symbolic expression trees to analyze."));
      Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
      Parameters.Add(new ValueLookupParameter<ISymbolicExpressionTreeInterpreter>(SymbolicExpressionTreeInterpreterParameterName, "The interpreter that should be used for the analysis of symbolic expression trees."));
      Parameters.Add(new ValueLookupParameter<DataAnalysisProblemData>(ProblemDataParameterName, "The problem data for which the symbolic expression tree is a solution."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesStartParameterName, "The first index of the validation partition of the data set."));
      Parameters.Add(new ValueLookupParameter<IntValue>(ValidationSamplesEndParameterName, "The last index of the validation partition of the data set."));
      Parameters.Add(new ValueParameter<PercentValue>(RelativeNumberOfEvaluatedSamplesParameterName, "The relative number of samples of the dataset partition, which should be randomly chosen for evaluation between the start and end index.", new PercentValue(1)));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(UpperEstimationLimitParameterName, "The upper estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new ValueLookupParameter<DoubleValue>(LowerEstimationLimitParameterName, "The lower estimation limit that was set for the evaluation of the symbolic expression trees."));
      Parameters.Add(new LookupParameter<SymbolicRegressionSolution>(BestSolutionParameterName, "The best symbolic regression solution."));
      Parameters.Add(new LookupParameter<IntValue>(GenerationsParameterName, "The number of generations calculated so far."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestSolutionQualityParameterName, "The quality of the best symbolic regression solution."));
      Parameters.Add(new LookupParameter<ResultCollection>(ResultsParameterName, "The result collection where the best symbolic regression solution should be stored."));
      Parameters.Add(new LookupParameter<DoubleValue>(BestKnownQualityParameterName, "The best known (validation) quality achieved on the data set."));
      Parameters.Add(new LookupParameter<DataTable>(VariableFrequenciesParameterName, "The variable frequencies table to use for the calculation of variable impacts"));
    }

    // NOTE(review): this chains to base() instead of forwarding the deserializing flag;
    // confirm whether the base class offers a (bool deserializing) constructor that should be used.
    [StorableConstructor]
    private FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer(bool deserializing) : base() { }

    /// <summary>
    /// Adds parameters that may be missing in previously stored instances.
    /// </summary>
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      #region compatibility remove before releasing 3.3.1
      if (!Parameters.ContainsKey(EvaluatorParameterName)) {
        Parameters.Add(new LookupParameter<ISymbolicRegressionEvaluator>(EvaluatorParameterName, "The evaluator which should be used to evaluate the solution on the validation set."));
      }
      if (!Parameters.ContainsKey(MaximizationParameterName)) {
        Parameters.Add(new LookupParameter<BoolValue>(MaximizationParameterName, "The direction of optimization."));
      }
      #endregion
    }

    /// <summary>
    /// Evaluates all trees on a random sample of the validation rows, updates the best
    /// validation solution if a better tree was found and records the validation qualities.
    /// </summary>
    /// <returns>The operation returned by the base implementation.</returns>
    public override IOperation Apply() {
      // Read the looked-up values once instead of going through the parameter properties
      // again for every tree and every row.
      var trees = SymbolicExpressionTree;
      var problemData = ProblemData;
      var interpreter = SymbolicExpressionTreeInterpreter;
      var evaluator = Evaluator;
      bool maximization = Maximization.Value;

      string targetVariable = problemData.TargetVariable.Value;

      // select a random subset of rows in the validation set
      int validationStart = ValidiationSamplesStart.Value;
      int validationEnd = ValidationSamplesEnd.Value;
      int seed = Random.Next();
      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
      if (count == 0) count = 1;

      // Rows inside the test partition must not be used for validation. The sample is
      // materialized once so that the sampling and filtering are not repeated for each tree.
      int testStart = problemData.TestSamplesStart.Value;
      int testEnd = problemData.TestSamplesEnd.Value;
      int[] rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count)
        .Where(row => row < testStart || testEnd <= row)
        .ToArray();

      var upperLimitValue = UpperEstimationLimit;
      var lowerLimitValue = LowerEstimationLimit;
      double upperEstimationLimit = upperLimitValue != null ? upperLimitValue.Value : double.PositiveInfinity;
      double lowerEstimationLimit = lowerLimitValue != null ? lowerLimitValue.Value : double.NegativeInfinity;

      double bestQuality = maximization ? double.NegativeInfinity : double.PositiveInfinity;
      SymbolicExpressionTree bestTree = null;

      foreach (var tree in trees) {
        double quality = evaluator.Evaluate(interpreter, tree,
          lowerEstimationLimit, upperEstimationLimit,
          problemData.Dataset, targetVariable,
          rows);

        if ((maximization && quality > bestQuality) ||
            (!maximization && quality < bestQuality)) {
          bestQuality = quality;
          bestTree = tree;
        }
      }

      // No tree was selected: either there are no trees or no quality compared better than the
      // initial infinity (e.g. NaN). There is nothing meaningful to scale, store or record,
      // so skip this application instead of passing a null tree on.
      if (bestTree == null) return base.Apply();

      // if the best validation tree is better than the current best solution => update
      var previousBestQuality = BestSolutionQuality;
      bool newBest =
        previousBestQuality == null ||
        (maximization && bestQuality > previousBestQuality.Value) ||
        (!maximization && bestQuality < previousBestQuality.Value);
      if (newBest) {
        // calculate the scaling parameters only for the best tree, using the full training set
        double alpha, beta;
        SymbolicRegressionScaledMeanSquaredErrorEvaluator.Calculate(interpreter, bestTree,
          lowerEstimationLimit, upperEstimationLimit,
          problemData.Dataset, targetVariable,
          problemData.TrainingIndizes, out beta, out alpha);

        // scale tree for solution
        var scaledTree = SymbolicRegressionSolutionLinearScaler.Scale(bestTree, alpha, beta);
        var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)interpreter.Clone(),
          scaledTree);
        var solution = new SymbolicRegressionSolution((DataAnalysisProblemData)problemData.Clone(), model, lowerEstimationLimit, upperEstimationLimit);
        solution.Name = BestSolutionParameterName;
        solution.Description = "Best solution on validation partition found over the whole run.";

        BestSolutionParameter.ActualValue = solution;
        BestSolutionQualityParameter.ActualValue = new DoubleValue(bestQuality);

        BestSymbolicRegressionSolutionAnalyzer.UpdateBestSolutionResults(solution, problemData, Results, Generations, VariableFrequencies);
      }

      var results = Results;
      if (!results.ContainsKey(BestSolutionQualityValuesParameterName)) {
        results.Add(new Result(BestSolutionQualityValuesParameterName, new DataTable(BestSolutionQualityValuesParameterName, BestSolutionQualityValuesParameterName)));
        results.Add(new Result(BestSolutionQualityParameterName, new DoubleValue()));
        results.Add(new Result(CurrentBestValidationQualityParameterName, new DoubleValue()));
      }

      // Read the stored value back after a possible update above.
      double bestSolutionQuality = BestSolutionQualityParameter.ActualValue.Value;
      results[BestSolutionQualityParameterName].Value = new DoubleValue(bestSolutionQuality);
      results[CurrentBestValidationQualityParameterName].Value = new DoubleValue(bestQuality);

      DataTable validationValues = (DataTable)results[BestSolutionQualityValuesParameterName].Value;
      AddValue(validationValues, bestSolutionQuality, BestSolutionQualityParameterName, BestSolutionQualityParameterName);
      AddValue(validationValues, bestQuality, CurrentBestValidationQualityParameterName, CurrentBestValidationQualityParameterName);
      return base.Apply();
    }

    /// <summary>
    /// Appends <paramref name="data"/> to the row called <paramref name="name"/> of
    /// <paramref name="table"/>; the row is created (with <paramref name="description"/>)
    /// and added to the table if it does not exist yet.
    /// </summary>
    private static void AddValue(DataTable table, double data, string name, string description) {
      DataRow row;
      if (table.Rows.TryGetValue(name, out row)) {
        row.Values.Add(data);
        return;
      }
      // The value is added before the row is attached to the table, as before.
      row = new DataRow(name, description);
      row.Values.Add(data);
      table.Rows.Add(row);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.