
source: branches/2925_AutoDiffForDynamicalModels/HeuristicLab.Problems.DynamicalSystemsModelling/3.3/Problem.cs @ 16214

Last change on this file since 16214 was 16214, checked in by gkronber, 6 years ago

#2925 added datasets from supplementary material for "distilling free-form laws ..." (oscillator, pendulum, double pendulum, ...)

File size: 41.2 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Analysis;
using HeuristicLab.Collections;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;
using HeuristicLab.Problems.Instances;
using Variable = HeuristicLab.Problems.DataAnalysis.Symbolic.Variable;

namespace HeuristicLab.Problems.DynamicalSystemsModelling {
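  // Vector holds the gradient of a scalar value with respect to all numeric parameters θ.
  // It is the carrier for forward-mode automatic differentiation in InterpretRec below:
  // values are propagated together with their gradient vectors. Zero is a sentinel for an
  // empty gradient that avoids allocating arrays for terms without parameters.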
  public class Vector {
    public readonly static Vector Zero = new Vector(new double[0]);

    public static Vector operator +(Vector a, Vector b) {
      if(a == Zero) return b;
      if(b == Zero) return a;
      Debug.Assert(a.arr.Length == b.arr.Length);
      var res = new double[a.arr.Length];
      for(int i = 0; i < res.Length; i++)
        res[i] = a.arr[i] + b.arr[i];
      return new Vector(res);
    }
    public static Vector operator -(Vector a, Vector b) {
      if(b == Zero) return a;
      if(a == Zero) return -b;
      Debug.Assert(a.arr.Length == b.arr.Length);
      var res = new double[a.arr.Length];
      for(int i = 0; i < res.Length; i++)
        res[i] = a.arr[i] - b.arr[i];
      return new Vector(res);
    }
    public static Vector operator -(Vector v) {
      if(v == Zero) return Zero;
      // allocate a new array instead of negating in place; gradient vectors may be shared
      var res = new double[v.arr.Length];
      for(int i = 0; i < res.Length; i++)
        res[i] = -v.arr[i];
      return new Vector(res);
    }

    public static Vector operator *(double s, Vector v) {
      if(v == Zero) return Zero;
      if(s == 0.0) return Zero;
      var res = new double[v.arr.Length];
      for(int i = 0; i < res.Length; i++)
        res[i] = s * v.arr[i];
      return new Vector(res);
    }
    public static Vector operator *(Vector v, double s) {
      return s * v;
    }
    public static Vector operator /(double s, Vector v) {
      if(s == 0.0) return Zero;
      if(v == Zero) throw new ArgumentException("Division by zero vector");
      var res = new double[v.arr.Length];
      for(int i = 0; i < res.Length; i++)
        res[i] = s / v.arr[i]; // element-wise division
      return new Vector(res);
    }
    public static Vector operator /(Vector v, double s) {
      return v * (1.0 / s); // parentheses required: v * 1.0 / s would call this operator recursively
    }


    private readonly double[] arr; // backing array

    public Vector(double[] v) {
      this.arr = v;
    }

    public void CopyTo(double[] target) {
      Debug.Assert(arr.Length <= target.Length);
      Array.Copy(arr, target, arr.Length);
    }
  }
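  // Usage sketch: gradient vectors are seeded at the θ-nodes as unit vectors (see InterpretRec)
  // and composed by the operators above. For f(θ0, θ1) = θ0 * θ1 at θ = (2, 3) the product rule
  // gives ∇f = (3, 2):
  //   var g0 = new Vector(new double[] { 1.0, 0.0 }); // gradient of θ0
  //   var g1 = new Vector(new double[] { 0.0, 1.0 }); // gradient of θ1
  //   var gradF = 3.0 * g0 + 2.0 * g1;                // ∇(θ0·θ1) = θ1·∇θ0 + θ0·∇θ1 = (3, 2)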

  [Item("Dynamical Systems Modelling Problem", "TODO")]
  [Creatable(CreatableAttribute.Categories.GeneticProgrammingProblems, Priority = 900)]
  [StorableClass]
  public sealed class Problem : SingleObjectiveBasicProblem<MultiEncoding>, IRegressionProblem, IProblemInstanceConsumer<IRegressionProblemData>, IProblemInstanceExporter<IRegressionProblemData> {

    #region parameter names
    private const string ProblemDataParameterName = "Data";
    private const string TargetVariablesParameterName = "Target variables";
    private const string FunctionSetParameterName = "Function set";
    private const string MaximumLengthParameterName = "Size limit";
    private const string MaximumParameterOptimizationIterationsParameterName = "Max. parameter optimization iterations";
    private const string NumberOfLatentVariablesParameterName = "Number of latent variables";
    private const string NumericIntegrationStepsParameterName = "Steps for numeric integration";
    private const string TrainingEpisodesParameterName = "Training episodes";
    private const string OptimizeParametersForEpisodesParameterName = "Optimize parameters for episodes";
    #endregion

    #region Parameter Properties
    IParameter IDataAnalysisProblem.ProblemDataParameter { get { return ProblemDataParameter; } }

    public IValueParameter<IRegressionProblemData> ProblemDataParameter {
      get { return (IValueParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
    }
    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> TargetVariablesParameter {
      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[TargetVariablesParameterName]; }
    }
    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> FunctionSetParameter {
      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[FunctionSetParameterName]; }
    }
    public IFixedValueParameter<IntValue> MaximumLengthParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumLengthParameterName]; }
    }
    public IFixedValueParameter<IntValue> MaximumParameterOptimizationIterationsParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumParameterOptimizationIterationsParameterName]; }
    }
    public IFixedValueParameter<IntValue> NumberOfLatentVariablesParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[NumberOfLatentVariablesParameterName]; }
    }
    public IFixedValueParameter<IntValue> NumericIntegrationStepsParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[NumericIntegrationStepsParameterName]; }
    }
    public IValueParameter<ItemList<IntRange>> TrainingEpisodesParameter {
      get { return (IValueParameter<ItemList<IntRange>>)Parameters[TrainingEpisodesParameterName]; }
    }
    public IFixedValueParameter<BoolValue> OptimizeParametersForEpisodesParameter {
      get { return (IFixedValueParameter<BoolValue>)Parameters[OptimizeParametersForEpisodesParameterName]; }
    }
    #endregion

    #region Properties
    public IRegressionProblemData ProblemData {
      get { return ProblemDataParameter.Value; }
      set { ProblemDataParameter.Value = value; }
    }
    IDataAnalysisProblemData IDataAnalysisProblem.ProblemData { get { return ProblemData; } }

    public ReadOnlyCheckedItemCollection<StringValue> TargetVariables {
      get { return TargetVariablesParameter.Value; }
    }

    public ReadOnlyCheckedItemCollection<StringValue> FunctionSet {
      get { return FunctionSetParameter.Value; }
    }

    public int MaximumLength {
      get { return MaximumLengthParameter.Value.Value; }
    }
    public int MaximumParameterOptimizationIterations {
      get { return MaximumParameterOptimizationIterationsParameter.Value.Value; }
    }
    public int NumberOfLatentVariables {
      get { return NumberOfLatentVariablesParameter.Value.Value; }
    }
    public int NumericIntegrationSteps {
      get { return NumericIntegrationStepsParameter.Value.Value; }
    }
    public IEnumerable<IntRange> TrainingEpisodes {
      get { return TrainingEpisodesParameter.Value; }
    }
    public bool OptimizeParametersForEpisodes {
      get { return OptimizeParametersForEpisodesParameter.Value.Value; }
    }

    #endregion

    public event EventHandler ProblemDataChanged;

    public override bool Maximization {
      get { return false; } // we minimize NMSE
    }

    #region item cloning and persistence
    // persistence
    [StorableConstructor]
    private Problem(bool deserializing) : base(deserializing) { }
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      if(!Parameters.ContainsKey(OptimizeParametersForEpisodesParameterName)) {
        Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select whether parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
      }
      RegisterEventHandlers();
    }

    // cloning
    private Problem(Problem original, Cloner cloner)
      : base(original, cloner) {
      RegisterEventHandlers();
    }
    public override IDeepCloneable Clone(Cloner cloner) { return new Problem(this, cloner); }
    #endregion

    public Problem()
      : base() {
      var targetVariables = new CheckedItemCollection<StringValue>().AsReadOnly(); // HACK: it would be better to provide a new class derived from IDataAnalysisProblem
      var functions = CreateFunctionSet();
      Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system. Use the CSV import functionality to import data.", new RegressionProblemData()));
      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(TargetVariablesParameterName, "Target variables (overrides the setting in ProblemData)", targetVariables));
      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(FunctionSetParameterName, "The list of allowed functions", functions));
      Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximum allowed length of each expression. Set to a small value (5 - 25). Default = 10", new IntValue(10)));
      Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for the optimization of parameters (using L-BFGS). More iterations make the algorithm slower; fewer iterations might prevent convergence of the optimization scheme. Default = 100", new IntValue(100)));
      Parameters.Add(new FixedValueParameter<IntValue>(NumberOfLatentVariablesParameterName, "Latent variables (unobserved variables) allow us to produce expressions which are integrated and can be used in other expressions. They are handled similarly to target variables in forward simulation / integration. The difference to target variables is that there are no data to which the calculated values of latent variables are compared. Set to a small value (0 - 5) as necessary (default = 0)", new IntValue(0)));
      Parameters.Add(new FixedValueParameter<IntValue>(NumericIntegrationStepsParameterName, "The number of steps in the numeric integration that are taken from one row to the next (set to 1 - 100). More steps make the algorithm slower; fewer steps worsen the accuracy of the numeric integration scheme.", new IntValue(10)));
      Parameters.Add(new ValueParameter<ItemList<IntRange>>(TrainingEpisodesParameterName, "A list of ranges that should be used for training; each range represents an independent episode. This overrides the TrainingSet parameter in ProblemData.", new ItemList<IntRange>()));
      Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select whether parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
      RegisterEventHandlers();
      InitAllParameters();

      // TODO: do not clear the selection of target variables when the input variables are changed (keep the selected target variables)
      // TODO: UI hangs when selecting / deselecting input variables because the encoding is updated on each item

    }
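    // Evaluation: extract one tree per calculated variable from the individual, fit the numeric
    // parameters θ with L-BFGS (see OptimizeForEpisodes), and return the (size-weighted) NMSE
    // over the training episodes. The optimized θ is written back onto the individual so that
    // Analyze can reuse it without re-optimizing.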
    public override double Evaluate(Individual individual, IRandom random) {
      var trees = individual.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from the individual

      if(OptimizeParametersForEpisodes) {
        int eIdx = 0;
        double totalNMSE = 0.0;
        int totalSize = 0;
        foreach(var episode in TrainingEpisodes) {
          double[] optTheta;
          double nmse;
          OptimizeForEpisodes(trees, random, new[] { episode }, out optTheta, out nmse);
          individual["OptTheta_" + eIdx] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analyze method
          eIdx++;
          totalNMSE += nmse * episode.Size;
          totalSize += episode.Size;
        }
        return totalNMSE / totalSize;
      } else {
        double[] optTheta;
        double nmse;
        OptimizeForEpisodes(trees, random, TrainingEpisodes, out optTheta, out nmse);
        individual["OptTheta"] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analyze method
        return nmse;
      }
    }
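    // Parameter fitting: collect the target values for all episode rows, assign each θ-node a
    // position in the parameter vector, initialize θ ~ Unif(-1, 1), and minimize the NMSE with
    // L-BFGS using the exact gradient produced by forward AD in EvaluateObjectiveAndGradient.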
    private void OptimizeForEpisodes(ISymbolicExpressionTree[] trees, IRandom random, IEnumerable<IntRange> episodes, out double[] optTheta, out double nmse) {
      var rows = episodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start)).ToArray();
      var problemData = ProblemData;
      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
      var targetValues = new double[rows.Length, targetVars.Length];

      // collect values of all target variables
      var colIdx = 0;
      foreach(var targetVar in targetVars) {
        int rowIdx = 0;
        foreach(var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
          targetValues[rowIdx, colIdx] = value;
          rowIdx++;
        }
        colIdx++;
      }

      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();

      foreach(var tree in trees) {
        foreach(var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
          nodeIdx.Add(node, nodeIdx.Count);
        }
      }

      var theta = nodeIdx.Select(_ => random.NextDouble() * 2.0 - 1.0).ToArray(); // init params randomly from Unif(-1,1)

      optTheta = new double[0];
      if(theta.Length > 0) {
        alglib.minlbfgsstate state;
        alglib.minlbfgsreport report;
        alglib.minlbfgscreate(Math.Min(theta.Length, 5), theta, out state);
        alglib.minlbfgssetcond(state, 0.0, 0.0, 0.0, MaximumParameterOptimizationIterations);
        alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null,
          new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables }); // TODO: create a type
        alglib.minlbfgsresults(state, out optTheta, out report);

        /*
         * L-BFGS algorithm results
         *
         * INPUT PARAMETERS:
         *     State   -   algorithm state
         *
         * OUTPUT PARAMETERS:
         *     X       -   array[0..N-1], solution
         *     Rep     -   optimization report:
         *                 * Rep.TerminationType completion code:
         *                     * -7    gradient verification failed.
         *                             See MinLBFGSSetGradientCheck() for more information.
         *                     * -2    rounding errors prevent further improvement.
         *                             X contains best point found.
         *                     * -1    incorrect parameters were specified
         *                     *  1    relative function improvement is no more than EpsF.
         *                     *  2    relative step is no more than EpsX.
         *                     *  4    gradient norm is no more than EpsG
         *                     *  5    MaxIts steps were taken
         *                     *  7    stopping conditions are too stringent,
         *                             further improvement is impossible
         *                 * Rep.IterationsCount contains the iteration count
         *                 * Rep.NFEV contains the number of function evaluations
         */
        if(report.terminationtype < 0) { nmse = 10E6; return; }
      }

      // perform evaluation for optimal theta to get the quality value
      double[] grad = new double[optTheta.Length];
      nmse = double.NaN;
      EvaluateObjectiveAndGradient(optTheta, ref nmse, grad,
        new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables });
      if(double.IsNaN(nmse) || double.IsInfinity(nmse)) { nmse = 10E6; return; } // return a large value (TODO: be consistent by using NMSE)
    }
    private static void EvaluateObjectiveAndGradient(double[] x, ref double f, double[] grad, object obj) {
      var trees = (ISymbolicExpressionTree[])((object[])obj)[0];
      var targetVariables = (string[])((object[])obj)[1];
      var problemData = (IRegressionProblemData)((object[])obj)[2];
      var nodeIdx = (Dictionary<ISymbolicExpressionTreeNode, int>)((object[])obj)[3];
      var targetValues = (double[,])((object[])obj)[4];
      var episodes = (IntRange[])((object[])obj)[5];
      var numericIntegrationSteps = (int)((object[])obj)[6];
      var latentVariables = (string[])((object[])obj)[7];

      var predicted = Integrate(
        trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
        problemData.Dataset,
        problemData.AllowedInputVariables.ToArray(),
        targetVariables,
        latentVariables,
        episodes,
        nodeIdx,
        x, numericIntegrationSteps).ToArray();


      // for normalized MSE = 1/variance(t) * MSE(t, pred)
      // TODO: Perf. (by standardization of target variables before evaluation of all trees)
      var invVar = Enumerable.Range(0, targetVariables.Length)
        .Select(c => Enumerable.Range(0, targetValues.GetLength(0)).Select(row => targetValues[row, c])) // column vectors
        .Select(vec => vec.Variance())
        .Select(v => 1.0 / v)
        .ToArray();

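      // Gradient of the objective (sketch of the derivation):
      //   f(θ) = (1/n) Σ_r Σ_c (y_rc - ŷ_rc(θ))² / var_c
      //   ∂f/∂θ = -(2/n) Σ_r Σ_c (y_rc - ŷ_rc) / var_c · ∂ŷ_rc/∂θ
      // where ∂ŷ_rc/∂θ is the gradient vector carried through the integration (Item2 below).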
      // objective function is NMSE
      f = 0.0;
      int n = predicted.Length;
      double invN = 1.0 / n;
      var g = Vector.Zero;
      int r = 0;
      foreach(var y_pred in predicted) {
        for(int c = 0; c < y_pred.Length; c++) {

          var y_pred_f = y_pred[c].Item1;
          var y = targetValues[r, c];

          var res = (y - y_pred_f);
          var ressq = res * res;
          f += ressq * invN * invVar[c];
          g += -2.0 * res * y_pred[c].Item2 * invN * invVar[c];
        }
        r++;
      }

      g.CopyTo(grad);
    }

    public override void Analyze(Individual[] individuals, double[] qualities, ResultCollection results, IRandom random) {
      base.Analyze(individuals, qualities, results, random);

      if(!results.ContainsKey("Prediction (training)")) {
        results.Add(new Result("Prediction (training)", typeof(ReadOnlyItemList<DataTable>)));
      }
      if(!results.ContainsKey("Prediction (test)")) {
        results.Add(new Result("Prediction (test)", typeof(ReadOnlyItemList<DataTable>)));
      }
      if(!results.ContainsKey("Models")) {
        results.Add(new Result("Models", typeof(VariableCollection)));
      }

      var bestIndividualAndQuality = this.GetBestIndividual(individuals, qualities);
      var trees = bestIndividualAndQuality.Item1.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from the individual

      // TODO: extract common functionality from Evaluate and Analyze
      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
      foreach(var tree in trees) {
        foreach(var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
          nodeIdx.Add(node, nodeIdx.Count);
        }
      }
      var problemData = ProblemData;
      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees

      var trainingList = new ItemList<DataTable>();

      if(OptimizeParametersForEpisodes) {
        var eIdx = 0;
        var trainingPredictions = new List<Tuple<double, Vector>[][]>();
        foreach(var episode in TrainingEpisodes) {
          var episodes = new[] { episode };
          var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta_" + eIdx]).ToArray(); // see Evaluate
          var trainingPrediction = Integrate(
                                   trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
                                   problemData.Dataset,
                                   problemData.AllowedInputVariables.ToArray(),
                                   targetVars,
                                   latentVariables,
                                   episodes,
                                   nodeIdx,
                                   optTheta,
                                   NumericIntegrationSteps).ToArray();
          trainingPredictions.Add(trainingPrediction);
          eIdx++;
        }

        // only for actual target values
        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
        for(int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPredictions.SelectMany(arr => arr.Select(row => row[colIdx].Item1)).ToArray());
          trainingDataTable.Rows.Add(actualValuesRow);
          trainingDataTable.Rows.Add(predictedValuesRow);
          trainingList.Add(trainingDataTable);
        }
        results["Prediction (training)"].Value = trainingList.AsReadOnly();


        var models = new VariableCollection();

        foreach(var tup in targetVars.Zip(trees, Tuple.Create)) {
          var targetVarName = tup.Item1;
          var tree = tup.Item2;

          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
          models.Add(origTreeVar);
        }
        results["Models"].Value = models;
      } else {
        var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see Evaluate
        var trainingPrediction = Integrate(
                                   trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
                                   problemData.Dataset,
                                   problemData.AllowedInputVariables.ToArray(),
                                   targetVars,
                                   latentVariables,
                                   TrainingEpisodes,
                                   nodeIdx,
                                   optTheta,
                                   NumericIntegrationSteps).ToArray();
        // only for actual target values
        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
        for(int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPrediction.Select(arr => arr[colIdx].Item1).ToArray());
          trainingDataTable.Rows.Add(actualValuesRow);
          trainingDataTable.Rows.Add(predictedValuesRow);
          trainingList.Add(trainingDataTable);
        }
        // TODO: DRY for training and test
        var testList = new ItemList<DataTable>();
        var testRows = ProblemData.TestIndices.ToArray();
        var testPrediction = Integrate(
         trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
         problemData.Dataset,
         problemData.AllowedInputVariables.ToArray(),
         targetVars,
         latentVariables,
         new IntRange[] { ProblemData.TestPartition },
         nodeIdx,
         optTheta,
         NumericIntegrationSteps).ToArray();

        for(int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var testDataTable = new DataTable(targetVar + " prediction (test)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, testRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, testPrediction.Select(arr => arr[colIdx].Item1).ToArray());
          testDataTable.Rows.Add(actualValuesRow);
          testDataTable.Rows.Add(predictedValuesRow);
          testList.Add(testDataTable);
        }

        results["Prediction (training)"].Value = trainingList.AsReadOnly();
        results["Prediction (test)"].Value = testList.AsReadOnly();
        #region simplification of models
        // TODO: the dependency on HeuristicLab.Problems.DataAnalysis.Symbolic is not ideal
        var models = new VariableCollection();    // to store target var names and the original version of each tree

        foreach(var tup in targetVars.Zip(trees, Tuple.Create)) {
          var targetVarName = tup.Item1;
          var tree = tup.Item2;

          // when we reference HeuristicLab.Problems.DataAnalysis.Symbolic we can translate symbols
          int nextParIdx = 0;
          var shownTree = new SymbolicExpressionTree(TranslateTreeNode(tree.Root, optTheta, ref nextParIdx));

          // var shownTree = (SymbolicExpressionTree)tree.Clone();
          // var constantsNodeOrig = tree.IterateNodesPrefix().Where(IsConstantNode);
          // var constantsNodeShown = shownTree.IterateNodesPrefix().Where(IsConstantNode);
          //
          // foreach (var n in constantsNodeOrig.Zip(constantsNodeShown, (original, shown) => new { original, shown })) {
          //   double constantsVal = optTheta[nodeIdx[n.original]];
          //
          //   ConstantTreeNode replacementNode = new ConstantTreeNode(new Constant()) { Value = constantsVal };
          //
          //   var parentNode = n.shown.Parent;
          //   int replacementIndex = parentNode.IndexOfSubtree(n.shown);
          //   parentNode.RemoveSubtree(replacementIndex);
          //   parentNode.InsertSubtree(replacementIndex, replacementNode);
          // }

          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
          models.Add(origTreeVar);
          var simplifiedTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(simplified)");
          simplifiedTreeVar.Value = TreeSimplifier.Simplify(shownTree);
          models.Add(simplifiedTreeVar);

        }
        results["Models"].Value = models;
        #endregion
      }
    }

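    // TranslateTreeNode rewrites a tree from the problem's string-based grammar into the
    // standard symbolic-regression symbol set (Addition, Subtraction, ...) so that it can be
    // simplified and displayed; θ-nodes become Constant nodes carrying their optimized values.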
    private ISymbolicExpressionTreeNode TranslateTreeNode(ISymbolicExpressionTreeNode n, double[] parameterValues, ref int nextParIdx) {
      ISymbolicExpressionTreeNode translatedNode = null;
      if(n.Symbol is StartSymbol) {
        translatedNode = new StartSymbol().CreateTreeNode();
      } else if(n.Symbol is ProgramRootSymbol) {
        translatedNode = new ProgramRootSymbol().CreateTreeNode();
      } else if(n.Symbol.Name == "+") {
        translatedNode = new Addition().CreateTreeNode();
      } else if(n.Symbol.Name == "-") {
        translatedNode = new Subtraction().CreateTreeNode();
      } else if(n.Symbol.Name == "*") {
        translatedNode = new Multiplication().CreateTreeNode();
      } else if(n.Symbol.Name == "%") {
        translatedNode = new Division().CreateTreeNode();
      } else if(IsConstantNode(n)) {
        var constNode = (ConstantTreeNode)new Constant().CreateTreeNode();
        constNode.Value = parameterValues[nextParIdx];
        nextParIdx++;
        translatedNode = constNode;
      } else {
        // assume a variable name
        var varName = n.Symbol.Name;
        var varNode = (VariableTreeNode)new Variable().CreateTreeNode();
        varNode.Weight = 1.0;
        varNode.VariableName = varName;
        translatedNode = varNode;
      }
      foreach(var child in n.Subtrees) {
        translatedNode.AddSubtree(TranslateTreeNode(child, parameterValues, ref nextParIdx));
      }
      return translatedNode;
    }

    #region interpretation
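    // Numeric integration: starting from the observed values at the first row of each episode,
    // the target (and latent) variables are integrated forward with the explicit Euler method,
    // y(t + h) = y(t) + h * f(y(t), θ), using numericIntegrationSteps sub-steps of size
    // h = 1 / numericIntegrationSteps per dataset row. Gradients w.r.t. θ are integrated alongside.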
    private static IEnumerable<Tuple<double, Vector>[]> Integrate(
      ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, string[] latentVariables, IEnumerable<IntRange> episodes,
      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues, int numericIntegrationSteps = 100) {

      int NUM_STEPS = numericIntegrationSteps;
      double h = 1.0 / NUM_STEPS;

      foreach(var episode in episodes) {
        var rows = Enumerable.Range(episode.Start, episode.End - episode.Start);
        // return the first value as stored in the dataset
        yield return targetVariables
          .Select(targetVar => Tuple.Create(dataset.GetDoubleValue(targetVar, rows.First()), Vector.Zero))
          .ToArray();

        // integrate forward starting with known values for the target in t0

        var variableValues = new Dictionary<string, Tuple<double, Vector>>();
        var t0 = rows.First();
        foreach(var varName in inputVariables) {
          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
        }
        foreach(var varName in targetVariables) {
          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
        }
        // add value entries for latent variables which are also integrated
        foreach(var latentVar in latentVariables) {
          variableValues.Add(latentVar, Tuple.Create(0.0, Vector.Zero)); // we don't have observations for latent variables -> assume zero as the starting value
        }
        var calculatedVariables = targetVariables.Concat(latentVariables); // TODO: must coincide with the order of trees in the encoding

        foreach(var t in rows.Skip(1)) {
          for(int step = 0; step < NUM_STEPS; step++) {
            var deltaValues = new Dictionary<string, Tuple<double, Vector>>();
            foreach(var tup in trees.Zip(calculatedVariables, Tuple.Create)) {
              var tree = tup.Item1;
              var targetVarName = tup.Item2;
              // skip programRoot and startSymbol
              var res = InterpretRec(tree.Root.GetSubtree(0).GetSubtree(0), variableValues, nodeIdx, parameterValues);
              deltaValues.Add(targetVarName, res);
            }

            // update variableValues for the next step
            foreach(var kvp in deltaValues) {
              var oldVal = variableValues[kvp.Key];
              variableValues[kvp.Key] = Tuple.Create(
                oldVal.Item1 + h * kvp.Value.Item1,
                oldVal.Item2 + h * kvp.Value.Item2
              );
            }
          }

          // only return the target variables for calculation of errors
          yield return targetVariables
            .Select(targetVar => variableValues[targetVar])
            .ToArray();

          // update for the next time step
          foreach(var varName in inputVariables) {
            variableValues[varName] = Tuple.Create(dataset.GetDoubleValue(varName, t), Vector.Zero);
          }
        }
      }
    }

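    // InterpretRec evaluates a tree bottom-up and propagates (value, gradient) pairs, i.e.
    // forward-mode automatic differentiation: '+' and '-' add/subtract gradients, '*' applies
    // the product rule, and protected division '%' applies the quotient rule. A θ-node seeds
    // its own unit gradient vector; variables look up the pair accumulated during integration.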
    private static Tuple<double, Vector> InterpretRec(
      ISymbolicExpressionTreeNode node,
      Dictionary<string, Tuple<double, Vector>> variableValues,
      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx,
      double[] parameterValues
        ) {

      switch(node.Symbol.Name) {
        case "+": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues); // TODO: capture all parameters in a state type for interpretation
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);

            return Tuple.Create(l.Item1 + r.Item1, l.Item2 + r.Item2);
          }
        case "*": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);

            return Tuple.Create(l.Item1 * r.Item1, l.Item2 * r.Item1 + l.Item1 * r.Item2);
          }

        case "-": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);

            return Tuple.Create(l.Item1 - r.Item1, l.Item2 - r.Item2);
          }
        case "%": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);

            // protected division
            if(r.Item1.IsAlmost(0.0)) {
              return Tuple.Create(0.0, Vector.Zero);
            } else {
              return Tuple.Create(
                l.Item1 / r.Item1,
                l.Item1 * -1.0 / (r.Item1 * r.Item1) * r.Item2 + 1.0 / r.Item1 * l.Item2 // (f/g)' = f * (1/g)' + 1/g * f' = f * -1/g² * g' + 1/g * f'
                );
            }
          }
        default: {
            // distinguish other cases
            if(IsConstantNode(node)) {
              var vArr = new double[parameterValues.Length]; // backing array for the gradient vector
              vArr[nodeIdx[node]] = 1.0;
              var g = new Vector(vArr);
              return Tuple.Create(parameterValues[nodeIdx[node]], g);
            } else {
              // assume a variable name
              var varName = node.Symbol.Name;
              return variableValues[varName];
            }
          }
      }
    }
    #endregion

    #region events
    /*
     * Dependencies between parameters:
     *
     * ProblemData
     *    |
     *    V
     * TargetVariables   FunctionSet    MaximumLength    NumberOfLatentVariables
     *               |   |                 |                   |
     *               V   V                 |                   |
     *             Grammar <---------------+-------------------
     *                |
     *                V
     *            Encoding
     */
    private void RegisterEventHandlers() {
      ProblemDataParameter.ValueChanged += ProblemDataParameter_ValueChanged;
      if(ProblemDataParameter.Value != null) ProblemDataParameter.Value.Changed += ProblemData_Changed;

      TargetVariablesParameter.ValueChanged += TargetVariablesParameter_ValueChanged;
      if(TargetVariablesParameter.Value != null) TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;

      FunctionSetParameter.ValueChanged += FunctionSetParameter_ValueChanged;
      if(FunctionSetParameter.Value != null) FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;

      MaximumLengthParameter.Value.ValueChanged += MaximumLengthChanged;

      NumberOfLatentVariablesParameter.Value.ValueChanged += NumLatentVariablesChanged;
    }

    private void NumLatentVariablesChanged(object sender, EventArgs e) {
      UpdateGrammarAndEncoding();
    }

    private void MaximumLengthChanged(object sender, EventArgs e) {
      UpdateGrammarAndEncoding();
    }

    private void FunctionSetParameter_ValueChanged(object sender, EventArgs e) {
      FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
    }

    private void CheckedFunctionsChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
      UpdateGrammarAndEncoding();
    }

    private void TargetVariablesParameter_ValueChanged(object sender, EventArgs e) {
      TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
    }

    private void CheckedTargetVariablesChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
      UpdateGrammarAndEncoding();
    }

    private void ProblemDataParameter_ValueChanged(object sender, EventArgs e) {
      ProblemDataParameter.Value.Changed += ProblemData_Changed;
      OnProblemDataChanged();
      OnReset();
    }

    private void ProblemData_Changed(object sender, EventArgs e) {
      OnProblemDataChanged();
      OnReset();
    }

    private void OnProblemDataChanged() {
      UpdateTargetVariables();        // implicitly updates other dependent parameters
      var handler = ProblemDataChanged;
      if(handler != null) handler(this, EventArgs.Empty);
    }

    #endregion

    #region helper

    private void InitAllParameters() {
      UpdateTargetVariables(); // implicitly updates the grammar and the encoding
    }

    private ReadOnlyCheckedItemCollection<StringValue> CreateFunctionSet() {
      var l = new CheckedItemCollection<StringValue>();
      l.Add(new StringValue("+").AsReadOnly());
      l.Add(new StringValue("*").AsReadOnly());
      l.Add(new StringValue("%").AsReadOnly());
      l.Add(new StringValue("-").AsReadOnly());
      return l.AsReadOnly();
    }

    private static bool IsConstantNode(ISymbolicExpressionTreeNode n) {
      return n.Symbol.Name.StartsWith("θ");
    }
    private static bool IsLatentVariableNode(ISymbolicExpressionTreeNode n) {
      return n.Symbol.Name.StartsWith("λ");
    }


    private void UpdateTargetVariables() {
      var currentlySelectedVariables = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();

      var newVariablesList = new CheckedItemCollection<StringValue>(ProblemData.Dataset.VariableNames.Select(str => new StringValue(str).AsReadOnly()).ToArray()).AsReadOnly();
      var matchingItems = newVariablesList.Where(item => currentlySelectedVariables.Contains(item.Value)).ToArray();
      foreach(var matchingItem in matchingItems) {
        newVariablesList.SetItemCheckedState(matchingItem, true);
      }
      TargetVariablesParameter.Value = newVariablesList;
    }

    private void UpdateGrammarAndEncoding() {
      var encoding = new MultiEncoding();
      var g = CreateGrammar();
      foreach(var targetVar in TargetVariables.CheckedItems) {
        encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar + "_tree", g, MaximumLength, MaximumLength)); // only limit by length
      }
      for(int i = 1; i <= NumberOfLatentVariables; i++) {
        encoding = encoding.Add(new SymbolicExpressionTreeEncoding("λ" + i + "_tree", g, MaximumLength, MaximumLength));
      }
      Encoding = encoding;
    }

    private ISymbolicExpressionGrammar CreateGrammar() {
      // whenever ProblemData is changed we create a new grammar with the necessary symbols
      var g = new SimpleSymbolicExpressionGrammar();
      g.AddSymbols(FunctionSet.CheckedItems.Select(i => i.Value).ToArray(), 2, 2);

      // TODO
      //g.AddSymbols(new[] {
      //  "exp",
      //  "log", // log( <expr> ) // TODO: init a theta to ensure the value is always positive
      //  "exp_minus" // exp((-1) * <expr>)
      //}, 1, 1);

      foreach(var variableName in ProblemData.AllowedInputVariables.Union(TargetVariables.CheckedItems.Select(i => i.Value)))
        g.AddTerminalSymbol(variableName);

      // generate symbols for numeric parameters for which the value is optimized using AutoDiff
      // we generate multiple symbols to balance the probability of selecting a numeric parameter in the generation of random trees
      var numericConstantsFactor = 2.0;
      for(int i = 0; i < numericConstantsFactor * (ProblemData.AllowedInputVariables.Count() + TargetVariables.CheckedItems.Count()); i++) {
        g.AddTerminalSymbol("θ" + i); // numeric parameter for which the value is optimized using AutoDiff
      }

      // generate symbols for latent variables
      for(int i = 1; i <= NumberOfLatentVariables; i++) {
        g.AddTerminalSymbol("λ" + i); // latent variable, calculated by its own tree during integration
      }

      return g;
    }

    #endregion

    #region Import & Export
    public void Load(IRegressionProblemData data) {
      Name = data.Name;
      Description = data.Description;
      ProblemData = data;
    }

    public IRegressionProblemData Export() {
      return ProblemData;
    }
    #endregion

  }
}