Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3127-MRGP-VarPro-Exploration/HeuristicLab.Problems.VarProMRGP/3.3/Problem.cs @ 17988

Last change on this file since 17988 was 17988, checked in by gkronber, 3 years ago

#3127: initial import of VarProMRGP implementation (depends on new NativeInterpreter which supports VarPro)

File size: 38.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Linq;
24using HeuristicLab.Analysis;
25using HeuristicLab.Common;
26using HeuristicLab.Core;
27using HeuristicLab.Data;
28using HeuristicLab.Optimization;
29using HeuristicLab.Parameters;
30using HEAL.Attic;
31using HeuristicLab.Problems.Instances;
32using HeuristicLab.Problems.DataAnalysis;
33using HeuristicLab.Encodings.BinaryVectorEncoding;
34using HeuristicLab.Problems.DataAnalysis.Symbolic;
35using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
36using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
37using System.Collections.Generic;
38using System.Runtime.InteropServices;
39using HeuristicLab.Random;
40using HeuristicLab.Analysis.Statistics;
41
42namespace HeuristicLab.Problems.VarProMRGP {
43  [Item("VarPro Multi-Regression Genetic Programming", "Similar to MRGP but MRGP is a inappropriate name, we should think about a new name.")]
44  [Creatable(CreatableAttribute.Categories.CombinatorialProblems, Priority = 999)]
45  [StorableType("8B84830E-0DEB-44FD-B7E8-6DA2F64C0FF2")]
46  public sealed class Problem : SingleObjectiveBasicProblem<BinaryVectorEncoding>, IProblemInstanceConsumer<IRegressionProblemData> {
47
    // Typed accessors for the parameters registered in the constructor.
    public IValueParameter<IRegressionProblemData> RegressionProblemDataParameter => (IValueParameter<IRegressionProblemData>)Parameters["ProblemData"];
    public IValueParameter<VarProGrammar> GrammarParameter => (IValueParameter<VarProGrammar>)Parameters["Grammar"];
    public IFixedValueParameter<IntValue> MaxDepthParameter => (IFixedValueParameter<IntValue>)Parameters["MaxDepth"];
    public IFixedValueParameter<IntValue> MaxSizeParameter => (IFixedValueParameter<IntValue>)Parameters["MaxSize"];
    public OptionalValueParameter<ReadOnlyItemArray<StringValue>> FeaturesParameter => (OptionalValueParameter<ReadOnlyItemArray<StringValue>>)Parameters["Features"];
    public OptionalValueParameter<BinaryVector> BestKnownSolutionParameter => (OptionalValueParameter<BinaryVector>)Parameters["BestKnownSolution"];
    // Convenience wrappers around the parameter values.
    public IRegressionProblemData RegressionProblemData {
      get => RegressionProblemDataParameter.Value;
      set => RegressionProblemDataParameter.Value = value;
    }
    public VarProGrammar Grammar {
      get => GrammarParameter.Value;
      set => GrammarParameter.Value = value;
    }

    // maximum size (number of nodes) of a generated feature tree
    public int MaxSize {
      get => MaxSizeParameter.Value.Value;
      set => MaxSizeParameter.Value.Value = value;
    }

    // maximum depth of a generated feature tree
    public int MaxDepth {
      get => MaxDepthParameter.Value.Value;
      set => MaxDepthParameter.Value.Value = value;
    }

    // The auto-generated feature expressions (as infix strings, see UpdateFeaturesAndCode).
    // The parameter is kept read-only towards the GUI; the private setter lifts the
    // flag temporarily to write the new value.
    public ReadOnlyItemArray<StringValue> Features {
      get => FeaturesParameter.Value;
      private set {
        FeaturesParameter.ReadOnly = false;
        FeaturesParameter.Value = value;
        FeaturesParameter.ReadOnly = true;
      }
    }

    public BinaryVector BestKnownSolution {
      get => BestKnownSolutionParameter.Value;
      set => BestKnownSolutionParameter.Value = value;
    }


    // quality (CV MSE avg + stdev, see Evaluate) is minimized
    public override bool Maximization => false;
    // public override bool[] Maximization => new[] { false, false };
90
91
    #region not cloned or stored
    // Caches that are rebuilt on demand (see Evaluate); deliberately excluded from
    // cloning and persistence.
    private int[] featureIdx = null; // code[featureIdx[i]] is the last operation of feature i
    [ExcludeFromObjectGraphTraversal]
    private NativeInterpreter.NativeInstruction[] code = null; // concatenated instruction sequences of all features
    private Dictionary<string, GCHandle> cachedData = null; // pinned copies of the (scaled) dataset columns for the native interpreter
    private LinearTransformation[] transformations; // scaling applied per double variable (identity for the target)
    #endregion
99
100
    [StorableConstructor]
    private Problem(StorableConstructorFlag _) : base(_) { }
    // Cloning constructor. The caches in the "not cloned or stored" region are not
    // copied here; Evaluate rebuilds them lazily when it finds them null.
    private Problem(Problem original, Cloner cloner)
      : base(original, cloner) {
      RegisterEventHandlers();
    }
107
    // Default constructor: registers all parameters, configures the grammar for the
    // default problem data, and sets up operators and event handlers.
    public Problem() {
      var g = new VarProGrammar();

      Parameters.Add(new ValueParameter<IRegressionProblemData>("ProblemData", "", new RegressionProblemData()));
      Parameters.Add(new ValueParameter<VarProGrammar>("Grammar", "", g));
      Parameters.Add(new FixedValueParameter<IntValue>("MaxSize", "", new IntValue(10)));
      Parameters.Add(new FixedValueParameter<IntValue>("MaxDepth", "", new IntValue(6)));
      Parameters.Add(new OptionalValueParameter<ReadOnlyItemArray<StringValue>>("Features", "autogenerated"));
      Parameters.Add(new OptionalValueParameter<BinaryVector>("BestKnownSolution", ""));
      FeaturesParameter.ReadOnly = true;

      Encoding = new BinaryVectorEncoding("b");
      Encoding.Length = 10000; // default for number of features; kept in sync with Features.Length by FeaturesParameter_ValueChanged

      g.ConfigureVariableSymbols(RegressionProblemData);

      InitializeOperators();
      RegisterEventHandlers();
    }
127
128    public override IDeepCloneable Clone(Cloner cloner) {
129      return new Problem(this, cloner);
130    }
131
132
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      // event handlers are not persisted and must be re-attached after loading
      RegisterEventHandlers();
    }
137
138    #region event handling
139    // Dependencies of parameters and fields
140    // ProblemData
141    //   |
142    //   +------------------+
143    //   |                  |
144    // cachedData         Grammar             MaxSize           MaxDepth          MaxInteractions
145    // cachedDataSet        |                    |                 |                 |
146    //                      +--------------------+-----------------+-----------------+
147    //                      |
148    //                     Features
149    //                      Code
150    //                      |
151    //                     Encoding (Length)
152    //                      |
153    //                      +--------------------+
154    //                      |                    |
155    //                   BestKnownSolution      Operators (Parameter)
156    //                   BestKnownQuality
157
    // Wires the parameter-dependency chain documented in the diagram above.
    // NOTE(review): handlers are only ever added, never removed; repeatedly replacing
    // parameter values therefore keeps the old values reachable through their events.
    private void RegisterEventHandlers() {
      RegressionProblemDataParameter.ValueChanged += RegressionProblemDataParameter_ValueChanged;
      RegressionProblemData.Changed += RegressionProblemData_Changed;
      GrammarParameter.ValueChanged += GrammarParameter_ValueChanged;
      Grammar.Changed += Grammar_Changed;
      MaxSizeParameter.Value.ValueChanged += Value_ValueChanged;
      MaxDepthParameter.Value.ValueChanged += Value_ValueChanged;
      FeaturesParameter.ValueChanged += FeaturesParameter_ValueChanged;
    }
167
168    private void FeaturesParameter_ValueChanged(object sender, EventArgs e) {
169      if (Encoding.Length != Features.Length) {
170        Encoding.Length = Features.Length;
171        OnEncodingChanged();
172      }
173    }
174
    // MaxSize or MaxDepth changed => the feature set must be regenerated.
    private void Value_ValueChanged(object sender, EventArgs e) {
      UpdateFeaturesAndCode();
    }
178
    // Grammar content changed => the feature set must be regenerated.
    private void Grammar_Changed(object sender, EventArgs e) {
      UpdateFeaturesAndCode();
    }
182
    // A new grammar object was assigned: subscribe to it and regenerate the features.
    // NOTE(review): the previous grammar's Changed handler is never removed, and
    // re-assigning a value subscribes again — potential duplicate notifications.
    private void GrammarParameter_ValueChanged(object sender, EventArgs e) {
      Grammar.Changed += Grammar_Changed;
      UpdateFeaturesAndCode();
    }
187
    // Problem data content changed: re-pin the (re-scaled) data columns and
    // reconfigure the grammar's variable symbols for the new variables.
    private void RegressionProblemData_Changed(object sender, EventArgs e) {
      UpdateDataCache();
      Grammar.ConfigureVariableSymbols(RegressionProblemData);
    }
192
    // A new problem data object was assigned: subscribe to it, refresh the data
    // cache and reconfigure the grammar.
    // NOTE(review): same subscription-leak pattern as GrammarParameter_ValueChanged.
    private void RegressionProblemDataParameter_ValueChanged(object sender, EventArgs e) {
      RegressionProblemData.Changed += RegressionProblemData_Changed;
      UpdateDataCache();
      Grammar.ConfigureVariableSymbols(RegressionProblemData);
    }
198
    // Encoding (vector length) changed: invalidate best-known results and
    // re-parameterize the operators for the new length.
    protected override void OnEncodingChanged() {
      base.OnEncodingChanged();
      OnReset();
      ParameterizeOperators();
    }
204
    // Clears best-known quality/solution (they are no longer comparable after a reset).
    protected override void OnReset() {
      base.OnReset();
      BestKnownQualityParameter.ActualValue = null;
      BestKnownSolutionParameter.ActualValue = null;
    }
210
211    private void UpdateDataCache() {
212      if (cachedData != null) {
213        foreach (var gch in cachedData.Values) {
214          gch.Free();
215        }
216        cachedData = null;
217      }
218
219      var dataset = RegressionProblemData.Dataset;
220
221      // free handles to old data
222      if (cachedData != null) {
223        foreach (var gch in cachedData.Values) {
224          gch.Free();
225        }
226        cachedData = null;
227      }
228
229      // cache new data
230      cachedData = new Dictionary<string, GCHandle>();
231      transformations = new LinearTransformation[dataset.DoubleVariables.Count()];
232      int varIdx = 0;
233      foreach (var v in dataset.DoubleVariables) {
234        var values = dataset.GetDoubleValues(v).ToArray();
235        if (v == RegressionProblemData.TargetVariable) {
236          // do not scale target
237          var linTransform = new LinearTransformation(dataset.DoubleVariables);
238          linTransform.Addend = 0;
239          linTransform.Multiplier = 1.0;
240          transformations[varIdx++] = linTransform;
241        } else {
242          // Scale to 0 .. 1
243          var max = values.Max();
244          var min = values.Min();
245          var range = max - min;
246          var linTransform = new LinearTransformation(dataset.DoubleVariables);
247          transformations[varIdx++] = linTransform;
248          linTransform.Addend = -1 * min / range - 0.0;
249          linTransform.Multiplier = 1.0 / range;
250          for (int i = 0; i < values.Length; i++) {
251            values[i] = values[i] * linTransform.Multiplier + linTransform.Addend;
252          }
253        }
254        var gch = GCHandle.Alloc(values, GCHandleType.Pinned);
255        cachedData[v] = gch;
256      }
257    }
258
    // Regenerates the feature set (systematic enumeration with a fixed seed, so the
    // result is deterministic), compiles it to native interpreter code, and updates
    // the Features parameter with infix strings for display.
    private void UpdateFeaturesAndCode() {
      var features = GenerateFeaturesSystematic(10000, new MersenneTwister(31415), Grammar, MaxSize, MaxDepth, maxVariables: 3);
      GenerateCode(features, RegressionProblemData.Dataset);
      var formatter = new InfixExpressionFormatter();
      Features = new ItemArray<StringValue>(features.Select(fi => new StringValue(formatter.Format(fi, System.Globalization.NumberFormatInfo.InvariantInfo, formatString: "0.0")).AsReadOnly())).AsReadOnly();
    }
265
266
267    #endregion
268
    // Evaluates a binary vector: each set bit selects one precompiled feature (term).
    // The selected terms are combined linearly; the linear coefficients are fitted by
    // variable projection (VarPro) while the non-linear parameters inside the terms
    // are tuned by the native (Ceres-based) solver. Quality is avg + stdev of the
    // 5-fold CV MSE (minimized).
    public override double Evaluate(Individual individual, IRandom random) {
      if (cachedData == null || code == null) {
        // after loading from file or cloning the data cache and code must be restored
        UpdateDataCache();
        UpdateFeaturesAndCode();
      }
      var b = individual.BinaryVector(Encoding.Name);
      // solver options for fitting (10 iterations of non-linear parameter tuning)
      var fittingOptions = new NativeInterpreter.SolverOptions();
      fittingOptions.Iterations = 10;
      fittingOptions.Algorithm = NativeInterpreter.Algorithm.Krogh;
      fittingOptions.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR;
      fittingOptions.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION;
      fittingOptions.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT;

      // solver options for evaluation only (0 iterations => no parameter change)
      var evalOptions = new NativeInterpreter.SolverOptions();
      evalOptions.Iterations = 0;
      evalOptions.Algorithm = NativeInterpreter.Algorithm.Krogh;
      evalOptions.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR;
      evalOptions.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION;
      evalOptions.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT;

      var rows = RegressionProblemData.TrainingIndices.ToArray();

      var allRows = rows.ToArray(); // NOTE(review): redundant second copy of rows
      var nRows = allRows.Length;
      // map set bits to code indices of the corresponding terms
      var termIndexList = new List<int>();
      for (int i = 0; i < b.Length; i++) {
        if (b[i] == true) {
          termIndexList.Add(featureIdx[i]);
        }
      }

      var coefficients = new double[termIndexList.Count + 1]; // coefficients for all terms + offset
      // var avgCoefficients = new double[Features.Length];

      // 5-fold CV; note: nRows % nFolds trailing rows are never used in any fold
      var nFolds = 5;
      var nFoldRows = nRows / nFolds;
      var fittingRows = new int[(nFolds - 1) * nFoldRows];
      var fittingResult = new double[fittingRows.Length];

      var testRows = new int[nFoldRows];
      var testResult = new double[nFoldRows];
      var cvMSE = new List<double>();
      var trainMSE = new List<double>();
      for (int testFold = 0; testFold < nFolds; testFold++) {
        // TODO: THREAD SAFETY FOR PARALLEL EVALUATION
        // CODE IS MANIPULATED (parameter values are written back into the shared
        // 'code' array below; the lock serializes evaluations)
        lock (code) {
          var oldParameterValues = ExtractParameters(code, termIndexList);

          #region fit to training folds
          // copy rows from the four training folds
          var nextFoldIdx = 0;
          for (int fold = 0; fold < nFolds; fold++) {
            if (fold == testFold) {
              Array.Copy(allRows, fold * nFoldRows, testRows, 0, nFoldRows);
            } else {
              Array.Copy(allRows, fold * nFoldRows, fittingRows, nextFoldIdx, nFoldRows);
              nextFoldIdx += nFoldRows;
            }
          }

          var fittingTarget = RegressionProblemData.Dataset.GetDoubleValues(RegressionProblemData.TargetVariable, fittingRows).ToArray();

          Array.Clear(fittingResult, 0, fittingResult.Length);
          Array.Clear(coefficients, 0, coefficients.Length);


          NativeInterpreter.NativeWrapper.GetValuesVarPro(code, code.Length, termIndexList.ToArray(), termIndexList.Count, fittingRows, fittingRows.Length,
            coefficients, fittingOptions, fittingResult, fittingTarget, out var optSummary);
          #endregion

          // negative costs indicate a solver failure
          if (optSummary.InitialCost < 0 || optSummary.FinalCost < 0) throw new InvalidProgramException();
          trainMSE.Add(optSummary.FinalCost * 2.0 / fittingRows.Length); // ceres cost is 0.5 * sum of squared residuals
          // average of all coefficients
          // TODO return a statistic for the relevance of the term instead of the coefficient
          // for (int k = 0; k < termIndexList.Count; k++) {
          //   avgCoefficients[Array.IndexOf(featureIdx, termIndexList[k])] += coefficients[k] / nFolds; // TODO perf
          // }

          #region calculate output on test fold to determine CV MSE

          // evaluate all terms for test rows; start each prediction from the offset term
          var sumOutput = new double[testResult.Length];
          for (int r = 0; r < sumOutput.Length; r++) {
            sumOutput[r] = coefficients[coefficients.Length - 1]; // offset
          }

          for (int k = 0; k < termIndexList.Count; k++) {
            var termIdx = termIndexList[k];
            // copy relevant part of the code (instructions of this term only)
            var termCode = new NativeInterpreter.NativeInstruction[code[termIdx].length];
            Array.Copy(code, termIdx - termCode.Length + 1, termCode, 0, termCode.Length);
            NativeInterpreter.NativeWrapper.GetValues(termCode, termCode.Length, testRows, testRows.Length, evalOptions, testResult, null, out var evalSummary);

            for (int r = 0; r < sumOutput.Length; r++) {
              sumOutput[r] += coefficients[k] * testResult[r];
            }
          }

          var evalTarget = RegressionProblemData.Dataset.GetDoubleValues(RegressionProblemData.TargetVariable, testRows);
          if (sumOutput.Any(d => double.IsNaN(d))) cvMSE.Add(evalTarget.VariancePop()); // variance of target is MSE of constant model
          else cvMSE.Add(sumOutput.Zip(evalTarget, (ri, ti) => { var res = ti - ri; return res * res; }).Average());

          #endregion

          #region prepare for next varpro call
          var newParameterValues = ExtractParameters(code, termIndexList);
          // keep the updated parameter values only when the coefficient is significantly <> 0
          for (int i = 0; i < termIndexList.Count; i++) {
            // TODO would actually need to take variance of term into account
            if (Math.Abs(coefficients[i]) > 1e-5) {
              oldParameterValues[i] = newParameterValues[i];
            }
          }
          UpdateParameter(code, termIndexList, oldParameterValues);
        }
        #endregion
      }


      individual["Train MSE (avg)"] = new DoubleValue(trainMSE.Average());
      individual["Parameter"] = new DoubleArray(ExtractParameters(code, termIndexList).SelectMany(arr => arr).ToArray()); // store the parameter which worked for this individual for solution creation
      // individual["Coefficients"] = new DoubleArray(avgCoefficients); // for allele frequency analysis
      // return new double[] { avgCost, coefficients.Count(ci => Math.Abs(ci) > 1e-9) };
      // average of averages
      individual["CV MSE (avg)"] = new DoubleValue(cvMSE.Average());
      individual["CV MSE (stdev)"] = new DoubleValue(cvMSE.StandardDeviation());
      return cvMSE.Average() + cvMSE.StandardDeviation();
      // return new double[] { cvMSE.Average() + cvMSE.StandardDeviation(), termIndexList.Count };
    }
401
402
    // Tracks the best individual of the current generation: updates best-known
    // quality/solution, and (when the overall best improves) refits that individual
    // on the full training set and publishes the regression solution and quality
    // statistics in the results collection.
    public override void Analyze(Individual[] individuals, double[] qualities, ResultCollection results, IRandom random) {
      base.Analyze(individuals, qualities, results, random);

      // minimization => smallest quality value is best
      var orderedIndividuals = individuals.Zip(qualities, (i, q) => new { Individual = i, Quality = q }).OrderBy(z => z.Quality);
      var bestIndividual = orderedIndividuals.First().Individual;
      var bestQ = orderedIndividuals.First().Quality;
      if (double.IsNaN(BestKnownQuality) || bestQ < BestKnownQuality) {
        BestKnownQuality = bestQ;
        BestKnownSolution = bestIndividual.BinaryVector(Encoding.Name);
      }

      var curBestQuality = results.ContainsKey("BestQuality") ? ((DoubleValue)results["BestQuality"].Value).Value : double.NaN;
      if (double.IsNaN(curBestQuality) || bestQ < curBestQuality) {
        var bestVector = bestIndividual.BinaryVector(Encoding.Name);

        var options = new NativeInterpreter.SolverOptions();
        options.Iterations = 10; // optimize on full dataset
        options.Algorithm = NativeInterpreter.Algorithm.Krogh;
        options.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR;
        options.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION;
        options.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT;

        var rows = RegressionProblemData.TrainingIndices.ToArray();
        var target = RegressionProblemData.TargetVariableTrainingValues.ToArray();

        var rowsArray = rows.ToArray(); // NOTE(review): redundant second copy of rows
        var nRows = rowsArray.Length;
        var result = new double[nRows];
        // collect the code indices and display names of the selected terms
        var termIndexList = new List<int>();
        var predictorNames = new List<string>();
        for (int i = 0; i < bestVector.Length; i++) {
          if (bestVector[i] == true) {
            termIndexList.Add(featureIdx[i]);
            predictorNames.Add(Features[i].Value);
          }
        }
        var coefficients = new double[termIndexList.Count + 1]; // coefficients for all terms + offset
        // TODO: thread-safety
        lock (code) {
          // restore the non-linear parameter values found during evaluation, then refit
          UpdateParameter(code, termIndexList, ((DoubleArray)bestIndividual["Parameter"]).ToArray());

          NativeInterpreter.NativeWrapper.GetValuesVarPro(code, code.Length, termIndexList.ToArray(), termIndexList.Count, rows, nRows, coefficients, options, result, target, out var optSummary);
        }

        #region linear model statistics
        // build the [terms | target] matrix for (currently disabled) model statistics
        var xy = new double[nRows, termIndexList.Count + 1];
        for(int j=0;j<termIndexList.Count;j++) {
          var t = termIndexList[j];
          var termCode = new NativeInterpreter.NativeInstruction[code[t].length];
          Array.Copy(code, t - code[t].length + 1, termCode, 0, code[t].length);
          options.Iterations = 0; // evaluate only, keep parameters fixed
          NativeInterpreter.NativeWrapper.GetValues(termCode, termCode.Length, rows, nRows, options, result, null, out _);
          for(int i=0;i<nRows;i++) { xy[i, j] = result[i]; } // copy to matrix
        }
        // copy target to matrix
        for (int i = 0; i < nRows; i++) { xy[i, termIndexList.Count] = target[i]; } // copy to matrix
        predictorNames.Add("<const>"); // last coefficient is offset
        // var stats = Statistics.CalculateLinearModelStatistics(xy, coefficients);
        // results.AddOrUpdateResult("Statistics", stats.AsResultCollection(predictorNames));
        #endregion

        results.AddOrUpdateResult("Solution", CreateRegressionSolution(code, termIndexList, coefficients, RegressionProblemData));
        results.AddOrUpdateResult("BestQuality", new DoubleValue(bestQ));
        results.AddOrUpdateResult("CV MSE (avg)", bestIndividual["CV MSE (avg)"]);
        results.AddOrUpdateResult("CV MSE (stdev)", bestIndividual["CV MSE (stdev)"]);
      }
    }
470
471    #region retrieval / update of non-linear parameters
472    private double[][] ExtractParameters(NativeInterpreter.NativeInstruction[] code, List<int> termIndexList) {
473      var p = new double[termIndexList.Count][];
474      for (int i = 0; i < termIndexList.Count; i++) {
475        var termIdx = termIndexList[i];
476        var pList = new List<double>();
477        var start = termIdx - code[termIdx].length + 1;
478        for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) {
479          if (code[codeIdx].optimize)
480            pList.Add(code[codeIdx].value);
481        }
482        p[i] = pList.ToArray();
483      }
484      return p;
485    }
486
487    // parameters are given as array for each term
488    private void UpdateParameter(NativeInterpreter.NativeInstruction[] code, List<int> termIndexList, double[][] p) {
489      for (int i = 0; i < termIndexList.Count; i++) {
490        if (p[i].Length == 0) continue; // nothing to update
491        var termIdx = termIndexList[i];
492        var pIdx = 0;
493        var start = termIdx - code[termIdx].length + 1;
494        for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) {
495          if (code[codeIdx].optimize)
496            code[codeIdx].value = p[i][pIdx++];
497        }
498      }
499    }
500
501    // parameters are given as a flat array
502    private void UpdateParameter(NativeInterpreter.NativeInstruction[] code, List<int> termIndexList, double[] p) {
503      var pIdx = 0;
504      for (int i = 0; i < termIndexList.Count; i++) {
505        var termIdx = termIndexList[i];
506        var start = termIdx - code[termIdx].length + 1;
507        for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) {
508          if (code[codeIdx].optimize)
509            code[codeIdx].value = p[pIdx++];
510        }
511      }
512    }
513    #endregion
514
515
516
517    #region feature generation
518    private static ISymbolicExpressionTree[] GenerateFeatures(int n, IRandom random, ISymbolicDataAnalysisGrammar grammar, int maxSize, int maxDepth) {
519      var features = new ISymbolicExpressionTree[n];
520      var hashes = new HashSet<ulong>();
521      int i = 0;
522      while (i < features.Length) {
523        var t = ProbabilisticTreeCreator.Create(random, grammar, maxSize, maxDepth);
524        t = TreeSimplifier.Simplify(t);
525        var th = SymbolicExpressionTreeHash.ComputeHash(t);
526        if (!hashes.Contains(th)) {
527          features[i++] = t;
528          hashes.Add(th);
529        }
530      }
531      return features;
532    }
533    private static ISymbolicExpressionTree[] GenerateFeaturesSystematic(int n, IRandom random, ISymbolicDataAnalysisGrammar grammar, int maxSize, int maxDepth, int maxVariables) {
534      var hashes = new HashSet<ulong>();
535
536      var root = grammar.ProgramRootSymbol.CreateTreeNode();
537      var trees = new List<ISymbolicExpressionTreeNode>();
538      var incompleteTrees = new Queue<ISymbolicExpressionTreeNode>();
539      incompleteTrees.Enqueue(root);
540      while (incompleteTrees.Any() && trees.Count < n) {
541        var t = incompleteTrees.Dequeue();
542        // find first extension point
543        ISymbolicExpressionTreeNode parent = null;
544        var numVariables = 0;
545        int size = 0;
546        int depth = t.GetDepth();
547        foreach (var node in t.IterateNodesPrefix()) {
548          size++;
549          if (node is VariableTreeNodeBase) numVariables++;
550          if (node.SubtreeCount < grammar.GetMinimumSubtreeCount(node.Symbol)) {
551            parent = node;
552            break;
553          }
554        }
555        if (numVariables > maxVariables || size > maxSize || depth > maxDepth) continue;
556        if (parent == null) {
557          // no extension point found => sentence is complete
558          var hash = SymbolicExpressionTreeHash.ComputeHash(t);
559          if (hashes.Add(SymbolicExpressionTreeHash.ComputeHash(t))) {
560            trees.Add((ISymbolicExpressionTreeNode)t.Clone());
561          }
562
563          // check if the (complete) sentence can be extended
564          foreach (var node in t.IterateNodesPrefix()) {
565            if (node.SubtreeCount < grammar.GetMaximumSubtreeCount(node.Symbol)) {
566              parent = node;
567              break;
568            }
569          }
570          if (parent == null) {
571            // no extension possible => continue with next tree in queue
572            continue;
573          }
574        }
575
576        if (parent == null) throw new InvalidProgramException(); // assertion
577
578        // the sentence must / can be extended
579        var allowedChildSy = grammar.GetAllowedChildSymbols(parent.Symbol, parent.SubtreeCount).OrderBy(sy => sy.MinimumArity == 0 ? 0 : 1); // terminals first
580        if (!allowedChildSy.Any()) throw new InvalidProgramException(); // grammar fail
581
582        // make new variants and add them to the queue of incomplete trees
583        foreach (var sy in allowedChildSy) {
584          if (sy is DataAnalysis.Symbolic.Variable variableSy) {
585            // generate all variables
586            foreach (var varName in variableSy.VariableNames) {
587              var varNode = (VariableTreeNode)variableSy.CreateTreeNode();
588              varNode.ResetLocalParameters(random);
589              varNode.VariableName = varName;
590              varNode.Weight = 1.0;
591              parent.AddSubtree(varNode);
592              incompleteTrees.Enqueue((ISymbolicExpressionTreeNode)t.Clone());
593              parent.RemoveSubtree(parent.SubtreeCount - 1); // prepare for next iteration
594            }
595          } else {
596            var node = sy.CreateTreeNode();
597            node.ResetLocalParameters(random);
598            parent.AddSubtree(node);
599            incompleteTrees.Enqueue((ISymbolicExpressionTreeNode)t.Clone());
600            parent.RemoveSubtree(parent.SubtreeCount - 1); // prepare for next iteration
601          }
602        }
603
604      }
605      return trees.Select(r => new SymbolicExpressionTree(r)).ToArray();
606    }
607
608
    // Compiles all feature trees into a single flat instruction array for the native
    // interpreter; featureIdx[i] records the index of the last (root) instruction of
    // feature i within this.code.
    private void GenerateCode(ISymbolicExpressionTree[] features, IDataset dataset) {
      // maps a tree node to its native opcode; rejects symbols the native VarPro
      // interpreter cannot execute
      byte mapSupportedSymbols(ISymbolicExpressionTreeNode node) {
        var opCode = OpCodes.MapSymbolToOpCode(node);
        if (supportedOpCodes.Contains(opCode)) return opCode;
        else throw new NotSupportedException($"VarPro does not support {node.Symbol.Name}");
      };

      var i = 0;
      featureIdx = new int[features.Length];

      // capacity estimate: Compile skips the program-root and start nodes of each
      // tree (hence Length - 2); 20% slack for extra instructions emitted for log
      var code = new List<NativeInterpreter.NativeInstruction>(capacity: (int)(features.Sum(fi => fi.Length - 2) * 1.2));
      foreach (var f in features) {
        var featureCode = Compile(f, mapSupportedSymbols);

        code.AddRange(featureCode);
        featureIdx[i++] = code.Count - 1;
      }
      this.code = code.ToArray();
    }
628
629
    // opcodes the native VarPro interpreter supports (checked in GenerateCode)
    private static readonly HashSet<byte> supportedOpCodes = new HashSet<byte>() {
      (byte)OpCode.Constant,
      (byte)OpCode.Variable,
      (byte)OpCode.Add,
      (byte)OpCode.Sub,
      (byte)OpCode.Mul,
      (byte)OpCode.Div,
      (byte)OpCode.Exp,
      (byte)OpCode.Log,
      (byte)OpCode.Sin,
      (byte)OpCode.Cos,
      (byte)OpCode.Tan,
      (byte)OpCode.Tanh,
      // (byte)OpCode.Power,
      // (byte)OpCode.Root,
      (byte)OpCode.SquareRoot,
      (byte)OpCode.Square,
      (byte)OpCode.CubeRoot,
      (byte)OpCode.Cube,
      (byte)OpCode.Absolute,
      (byte)OpCode.AnalyticQuotient
    };
    // Translates one feature tree into a native instruction sequence: nodes are
    // visited in prefix order, the list is then reversed (children precede their
    // parent), and a second pass computes each instruction's subtree length.
    // Constant nodes are marked for non-linear optimization unless they come from
    // the special "<1.0>" symbol; variable nodes get a pointer to their pinned data
    // column from cachedData.
    private NativeInterpreter.NativeInstruction[] Compile(ISymbolicExpressionTree tree, Func<ISymbolicExpressionTreeNode, byte> opCodeMapper) {
      var root = tree.Root.GetSubtree(0).GetSubtree(0); // skip program root and start symbol
      var code = new List<NativeInterpreter.NativeInstruction>();
      foreach (var n in root.IterateNodesPrefix()) {
        var instr = new NativeInterpreter.NativeInstruction { narg = (ushort)n.SubtreeCount, opcode = opCodeMapper(n), length = 1 }; // length is updated in a second pass below
        if (n is VariableTreeNode variable) {
          instr.value = variable.Weight;
          instr.optimize = false;
          instr.data = cachedData[variable.VariableName].AddrOfPinnedObject();
        } else if (n is ConstantTreeNode constant) {
          instr.value = constant.Value;
          if (n.Symbol.Name != "<1.0>") // HACK TODO this depends on the name given in the grammar!
            instr.optimize = true; // the VarPro grammar is specifically designed to make sure we have all necessary and only necessary non-linear parameters
        }
        if (n.Symbol is Logarithm) {
          // for log(f(x)) generate code log ( sqrt(f(x)²)) to ensure argument to log is positive
          code.Add(instr);
          code.Add(new NativeInterpreter.NativeInstruction { narg = 1, opcode = (byte)OpCode.SquareRoot, length = 1 }); // length is updated in a second pass below
          code.Add(new NativeInterpreter.NativeInstruction { narg = 1, opcode = (byte)OpCode.Square, length = 1 });
        } else {
          // all other operations are added verbatim
          code.Add(instr);
        }
      }

      code.Reverse(); // prefix order reversed => arguments now precede their operator
      var codeArr = code.ToArray();

      // second pass: accumulate subtree lengths by walking back over each
      // instruction's arguments
      for (int i = 0; i < codeArr.Length; i++) {
        var c = i - 1;
        for (int j = 0; j < codeArr[i].narg; ++j) {
          codeArr[i].length += codeArr[c].length;
          c -= codeArr[c].length;
        }
      }

      return codeArr;
    }
691    #endregion
692
693    #region solution creation
694    private IRegressionSolution CreateRegressionSolution(NativeInterpreter.NativeInstruction[] code, IList<int> termIndexList, double[] coefficients, IRegressionProblemData problemData) {
695      // parse back solution from code (required because we need to used optimized parameter values)
696      var addSy = symbols[(byte)OpCode.Add];
697      var mulSy = symbols[(byte)OpCode.Mul];
698      var sum = addSy.CreateTreeNode();
699      for (int i = 0; i < termIndexList.Count; i++) {
700        if (Math.Abs(coefficients[i]) < 1e-8) continue;
701        var termIdx = termIndexList[i];
702        var prod = mulSy.CreateTreeNode();
703        var constNode = (ConstantTreeNode)constSy.CreateTreeNode();
704        constNode.Value = coefficients[i];
705        var term = CreateTree(code, termIdx);
706        prod.AddSubtree(constNode);
707        prod.AddSubtree(term);
708        sum.AddSubtree(prod);
709      }
710      {
711        var constNode = (ConstantTreeNode)constSy.CreateTreeNode();
712        constNode.Value = coefficients.Last();
713        sum.AddSubtree(constNode);
714      }
715      var root = (new ProgramRootSymbol()).CreateTreeNode();
716      var start = (new StartSymbol()).CreateTreeNode();
717      root.AddSubtree(start);
718      start.AddSubtree(sum);
719      var tree = new SymbolicExpressionTree(root);
720      var ds = problemData.Dataset;
721      var scaledDataset = new Dataset(ds.DoubleVariables, ds.ToArray(ds.DoubleVariables, transformations, Enumerable.Range(0, ds.Rows)));
722      var scaledProblemData = new RegressionProblemData(scaledDataset, problemData.AllowedInputVariables, problemData.TargetVariable, transformations);
723      scaledProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start;
724      scaledProblemData.TrainingPartition.End = problemData.TrainingPartition.End;
725      scaledProblemData.TestPartition.Start = problemData.TestPartition.Start;
726      scaledProblemData.TestPartition.End = problemData.TestPartition.End;
727      return new SymbolicRegressionSolution(
728        new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeNativeInterpreter()), scaledProblemData);
729    }
730
    // Maps native-interpreter opcodes back to HeuristicLab symbols.
    // Used when parsing trees back out of compiled native code (CreateTree / CreateRegressionSolution).
    Dictionary<byte, Symbol> symbols = new Dictionary<byte, Symbol>() {
      {(byte)OpCode.Add, new Addition()  },
      {(byte)OpCode.Sub, new Subtraction()  },
      {(byte)OpCode.Mul, new Multiplication()  },
      {(byte)OpCode.Div, new Division()  },
      {(byte)OpCode.Exp, new Exponential()  },
      {(byte)OpCode.Log, new Logarithm()  },
      {(byte)OpCode.Sin, new Sine()  },
      {(byte)OpCode.Cos, new Cosine()  },
      {(byte)OpCode.Tan, new Tangent()  },
      {(byte)OpCode.Tanh, new HyperbolicTangent()  },
      {(byte)OpCode.Square, new Square()  },
      {(byte)OpCode.SquareRoot, new SquareRoot()  },
      {(byte)OpCode.Cube, new Cube()  },
      {(byte)OpCode.CubeRoot, new CubeRoot()  },
      {(byte)OpCode.Absolute, new Absolute()  },
      {(byte)OpCode.AnalyticQuotient, new AnalyticQuotient()  },
    };
749
    // used for solutions only: template symbols for creating constant and variable
    // tree nodes when parsing models back from native code
    Symbol constSy = new Constant();
    Symbol varSy = new DataAnalysis.Symbolic.Variable();
753
754
755    private ISymbolicExpressionTreeNode CreateTree(NativeInterpreter.NativeInstruction[] code, int i) {
756      switch (code[i].opcode) {
757        case (byte)OpCode.Add:
758        case (byte)OpCode.Sub:
759        case (byte)OpCode.Mul:
760        case (byte)OpCode.Div:
761        case (byte)OpCode.Exp:
762        case (byte)OpCode.Log:
763        case (byte)OpCode.Sin:
764        case (byte)OpCode.Cos:
765        case (byte)OpCode.Tan:
766        case (byte)OpCode.Tanh:
767        case (byte)OpCode.Square:
768        case (byte)OpCode.SquareRoot:
769        case (byte)OpCode.Cube:
770        case (byte)OpCode.CubeRoot:
771        case (byte)OpCode.Absolute:
772        case (byte)OpCode.AnalyticQuotient: {
773            var node = symbols[code[i].opcode].CreateTreeNode();
774            var c = i - 1;
775            for (int childIdx = 0; childIdx < code[i].narg; childIdx++) {
776              node.AddSubtree(CreateTree(code, c));
777              c = c - code[c].length;
778            }
779            return node;
780          }
781        case (byte)OpCode.Constant: {
782            var node = (ConstantTreeNode)constSy.CreateTreeNode();
783            node.Value = code[i].value;
784            return node;
785          }
786        case (byte)OpCode.Variable: {
787            var node = (VariableTreeNode)varSy.CreateTreeNode();
788            node.Weight = code[i].value;
789            // TODO perf
790            node.VariableName = string.Empty;
791            foreach (var tup in this.cachedData) {
792              if (tup.Value.AddrOfPinnedObject() == code[i].data) {
793                node.VariableName = tup.Key;
794                break;
795              }
796            }
797            return node;
798          }
799        default: throw new NotSupportedException("unknown opcode");
800      }
801    }
802    #endregion
803
804    public void Load(IRegressionProblemData data) {
805      RegressionProblemData = data;
806    }
807
808    private void InitializeOperators() {
809      Operators.Add(new AlleleFrequencyAnalyzer());
810
811      var cvMSEAnalyzer = new BestAverageWorstQualityAnalyzer();
812      cvMSEAnalyzer.Name = "CVMSE Analzer";
813      ParameterizeAnalyzer(cvMSEAnalyzer, "CV MSE (avg)");
814      Operators.Add(cvMSEAnalyzer);
815     
816      var trainingMSEAnalyzer = new BestAverageWorstQualityAnalyzer();
817      trainingMSEAnalyzer.Name = "Training MSE Analzer";
818      ParameterizeAnalyzer(trainingMSEAnalyzer, "Train MSE (avg)");
819      Operators.Add(trainingMSEAnalyzer);
820
821      ParameterizeOperators();
822    }
823
824    private void ParameterizeAnalyzer(BestAverageWorstQualityAnalyzer analyzer, string qualityName) {
825      analyzer.QualityParameter.ActualName = qualityName;
826      analyzer.QualitiesParameter.ActualName = qualityName + " " + analyzer.QualitiesParameter.ActualName;
827      analyzer.BestQualityParameter.ActualName += " " + qualityName;
828      analyzer.CurrentAverageQualityParameter.ActualName += " " + qualityName;
829      analyzer.CurrentBestQualityParameter.ActualName += " " + qualityName;
830      analyzer.CurrentWorstQualityParameter.ActualName += " " + qualityName;
831      analyzer.BestKnownQualityParameter.ActualName += " " + qualityName;
832      analyzer.AbsoluteDifferenceBestKnownToBestParameter.ActualName += " " + qualityName;
833      analyzer.RelativeDifferenceBestKnownToBestParameter.ActualName += " " + qualityName;
834    }
835
836    private void ParameterizeOperators() {
837      foreach (var op in Operators) {
838        if (op is AlleleFrequencyAnalyzer alleleAnalyzer) {
839          alleleAnalyzer.SolutionParameter.ActualName = Encoding.Name;
840        }
841        if (op is MultiAnalyzer multiAnalyzer) {
842          var freqAnalyzer = Operators.OfType<AlleleFrequencyAnalyzer>().First();
843          multiAnalyzer.Operators.SetItemCheckedState(freqAnalyzer, true);
844        }
845      }
846      foreach (var op in Encoding.Operators) {
847        if (op is SomePositionsBitflipManipulator multiFlipManipulator) {
848          multiFlipManipulator.MutationProbabilityParameter.Value.Value = 1.0 / Encoding.Length; // one feature on average
849        } else if (op is RandomBinaryVectorCreator creator) {
850          creator.TrueProbability.Value = 20.0 / Encoding.Length; // 20 features on average
851        }
852      }
853    }
854  }
855}
Note: See TracBrowser for help on using the repository browser.