source: branches/2925_AutoDiffForDynamicalModels/HeuristicLab.Problems.DynamicalSystemsModelling/3.3/Problem.cs @ 16215

Last change on this file since 16215 was 16215, checked in by gkronber, 4 years ago

#2925: added sin and cos functions

File size: 42.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Diagnostics;
25using System.Linq;
26using HeuristicLab.Analysis;
27using HeuristicLab.Collections;
28using HeuristicLab.Common;
29using HeuristicLab.Core;
30using HeuristicLab.Data;
31using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
32using HeuristicLab.Optimization;
33using HeuristicLab.Parameters;
34using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
35using HeuristicLab.Problems.DataAnalysis;
36using HeuristicLab.Problems.DataAnalysis.Symbolic;
37using HeuristicLab.Problems.Instances;
38using Variable = HeuristicLab.Problems.DataAnalysis.Symbolic.Variable;
39
40namespace HeuristicLab.Problems.DynamicalSystemsModelling {
41  public class Vector {
42    public readonly static Vector Zero = new Vector(new double[0]);
43
44    public static Vector operator +(Vector a, Vector b) {
45      if (a == Zero) return b;
46      if (b == Zero) return a;
47      Debug.Assert(a.arr.Length == b.arr.Length);
48      var res = new double[a.arr.Length];
49      for (int i = 0; i < res.Length; i++)
50        res[i] = a.arr[i] + b.arr[i];
51      return new Vector(res);
52    }
53    public static Vector operator -(Vector a, Vector b) {
54      if (b == Zero) return a;
55      if (a == Zero) return -b;
56      Debug.Assert(a.arr.Length == b.arr.Length);
57      var res = new double[a.arr.Length];
58      for (int i = 0; i < res.Length; i++)
59        res[i] = a.arr[i] - b.arr[i];
60      return new Vector(res);
61    }
62    public static Vector operator -(Vector v) {
63      if (v == Zero) return Zero;
64      for (int i = 0; i < v.arr.Length; i++)
65        v.arr[i] = -v.arr[i];
66      return v;
67    }
68
69    public static Vector operator *(double s, Vector v) {
70      if (v == Zero) return Zero;
71      if (s == 0.0) return Zero;
72      var res = new double[v.arr.Length];
73      for (int i = 0; i < res.Length; i++)
74        res[i] = s * v.arr[i];
75      return new Vector(res);
76    }
77
78    public static Vector operator *(Vector v, double s) {
79      return s * v;
80    }
81    public static Vector operator *(Vector u, Vector v) {
82      if (v == Zero) return Zero;
83      if (u == Zero) return Zero;
84      var res = new double[v.arr.Length];
85      for (int i = 0; i < res.Length; i++)
86        res[i] = u.arr[i] * v.arr[i];
87      return new Vector(res);
88    }
89    public static Vector operator /(double s, Vector v) {
90      if (s == 0.0) return Zero;
91      if (v == Zero) throw new ArgumentException("Division by zero vector");
92      var res = new double[v.arr.Length];
93      for (int i = 0; i < res.Length; i++)
94        res[i] = 1.0 / v.arr[i];
95      return new Vector(res);
96    }
97    public static Vector operator /(Vector v, double s) {
98      return v * 1.0 / s;
99    }
100
101    public static Vector Sin(Vector s) {
102      var res = new double[s.arr.Length];
103      for (int i = 0; i < res.Length; i++) res[i] = Math.Sin(s.arr[i]);
104      return new Vector(res);
105    }
106    public static Vector Cos(Vector s) {
107      var res = new double[s.arr.Length];
108      for (int i = 0; i < res.Length; i++) res[i] = Math.Cos(s.arr[i]);
109      return new Vector(res);
110    }
111
112    private readonly double[] arr; // backing array;
113
114    public Vector(double[] v) {
115      this.arr = v;
116    }
117
118    public void CopyTo(double[] target) {
119      Debug.Assert(arr.Length <= target.Length);
120      Array.Copy(arr, target, arr.Length);
121    }
122  }
123
124  [Item("Dynamical Systems Modelling Problem", "TODO")]
125  [Creatable(CreatableAttribute.Categories.GeneticProgrammingProblems, Priority = 900)]
126  [StorableClass]
127  public sealed class Problem : SingleObjectiveBasicProblem<MultiEncoding>, IRegressionProblem, IProblemInstanceConsumer<IRegressionProblemData>, IProblemInstanceExporter<IRegressionProblemData> {
128
129    #region parameter names
130    private const string ProblemDataParameterName = "Data";
131    private const string TargetVariablesParameterName = "Target variables";
132    private const string FunctionSetParameterName = "Function set";
133    private const string MaximumLengthParameterName = "Size limit";
134    private const string MaximumParameterOptimizationIterationsParameterName = "Max. parameter optimization iterations";
135    private const string NumberOfLatentVariablesParameterName = "Number of latent variables";
136    private const string NumericIntegrationStepsParameterName = "Steps for numeric integration";
137    private const string TrainingEpisodesParameterName = "Training episodes";
138    private const string OptimizeParametersForEpisodesParameterName = "Optimize parameters for episodes";
139    #endregion
140
141    #region Parameter Properties
142    IParameter IDataAnalysisProblem.ProblemDataParameter { get { return ProblemDataParameter; } }
143
144    public IValueParameter<IRegressionProblemData> ProblemDataParameter {
145      get { return (IValueParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
146    }
147    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> TargetVariablesParameter {
148      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[TargetVariablesParameterName]; }
149    }
150    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> FunctionSetParameter {
151      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[FunctionSetParameterName]; }
152    }
153    public IFixedValueParameter<IntValue> MaximumLengthParameter {
154      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumLengthParameterName]; }
155    }
156    public IFixedValueParameter<IntValue> MaximumParameterOptimizationIterationsParameter {
157      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumParameterOptimizationIterationsParameterName]; }
158    }
159    public IFixedValueParameter<IntValue> NumberOfLatentVariablesParameter {
160      get { return (IFixedValueParameter<IntValue>)Parameters[NumberOfLatentVariablesParameterName]; }
161    }
162    public IFixedValueParameter<IntValue> NumericIntegrationStepsParameter {
163      get { return (IFixedValueParameter<IntValue>)Parameters[NumericIntegrationStepsParameterName]; }
164    }
165    public IValueParameter<ItemList<IntRange>> TrainingEpisodesParameter {
166      get { return (IValueParameter<ItemList<IntRange>>)Parameters[TrainingEpisodesParameterName]; }
167    }
168    public IFixedValueParameter<BoolValue> OptimizeParametersForEpisodesParameter {
169      get { return (IFixedValueParameter<BoolValue>)Parameters[OptimizeParametersForEpisodesParameterName]; }
170    }
171    #endregion
172
173    #region Properties
174    public IRegressionProblemData ProblemData {
175      get { return ProblemDataParameter.Value; }
176      set { ProblemDataParameter.Value = value; }
177    }
178    IDataAnalysisProblemData IDataAnalysisProblem.ProblemData { get { return ProblemData; } }
179
180    public ReadOnlyCheckedItemCollection<StringValue> TargetVariables {
181      get { return TargetVariablesParameter.Value; }
182    }
183
184    public ReadOnlyCheckedItemCollection<StringValue> FunctionSet {
185      get { return FunctionSetParameter.Value; }
186    }
187
188    public int MaximumLength {
189      get { return MaximumLengthParameter.Value.Value; }
190    }
191    public int MaximumParameterOptimizationIterations {
192      get { return MaximumParameterOptimizationIterationsParameter.Value.Value; }
193    }
194    public int NumberOfLatentVariables {
195      get { return NumberOfLatentVariablesParameter.Value.Value; }
196    }
197    public int NumericIntegrationSteps {
198      get { return NumericIntegrationStepsParameter.Value.Value; }
199    }
200    public IEnumerable<IntRange> TrainingEpisodes {
201      get { return TrainingEpisodesParameter.Value; }
202    }
203    public bool OptimizeParametersForEpisodes {
204      get { return OptimizeParametersForEpisodesParameter.Value.Value; }
205    }
206
207    #endregion
208
209    public event EventHandler ProblemDataChanged;
210
211    public override bool Maximization {
212      get { return false; } // we minimize NMSE
213    }
214
215    #region item cloning and persistence
216    // persistence
217    [StorableConstructor]
218    private Problem(bool deserializing) : base(deserializing) { }
219    [StorableHook(HookType.AfterDeserialization)]
220    private void AfterDeserialization() {
221      if (!Parameters.ContainsKey(OptimizeParametersForEpisodesParameterName)) {
222        Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select if parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
223      }
224      RegisterEventHandlers();
225    }
226
227    // cloning
228    private Problem(Problem original, Cloner cloner)
229      : base(original, cloner) {
230      RegisterEventHandlers();
231    }
232    public override IDeepCloneable Clone(Cloner cloner) { return new Problem(this, cloner); }
233    #endregion
234
235    public Problem()
236      : base() {
237      var targetVariables = new CheckedItemCollection<StringValue>().AsReadOnly(); // HACK: it would be better to provide a new class derived from IDataAnalysisProblem
238      var functions = CreateFunctionSet();
239      Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system. Use CSV import functionality to import data.", new RegressionProblemData()));
240      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(TargetVariablesParameterName, "Target variables (overrides setting in ProblemData)", targetVariables));
241      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(FunctionSetParameterName, "The list of allowed functions", functions));
242      Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximally allowed length of each expression. Set to a small value (5 - 25). Default = 10", new IntValue(10)));
243      Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for optimization of parameters (using L-BFGS). More iterations makes the algorithm slower, fewer iterations might prevent convergence in the optimization scheme. Default = 100", new IntValue(100)));
244      Parameters.Add(new FixedValueParameter<IntValue>(NumberOfLatentVariablesParameterName, "Latent variables (unobserved variables) allow us to produce expressions which are integrated up and can be used in other expressions. They are handled similarly to target variables in forward simulation / integration. The difference to target variables is that there are no data to which the calculated values of latent variables are compared. Set to a small value (0 .. 5) as necessary (default = 0)", new IntValue(0)));
245      Parameters.Add(new FixedValueParameter<IntValue>(NumericIntegrationStepsParameterName, "Number of steps in the numeric integration that are taken from one row to the next (set to 1 to 100). More steps makes the algorithm slower, less steps worsens the accuracy of the numeric integration scheme.", new IntValue(10)));
246      Parameters.Add(new ValueParameter<ItemList<IntRange>>(TrainingEpisodesParameterName, "A list of ranges that should be used for training, each range represents an independent episode. This overrides the TrainingSet parameter in ProblemData.", new ItemList<IntRange>()));
247      Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select if parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
248      RegisterEventHandlers();
249      InitAllParameters();
250
251      // TODO: do not clear selection of target variables when the input variables are changed (keep selected target variables)
252      // TODO: UI hangs when selecting / deselecting input variables because the encoding is updated on each item
253      // TODO: use training range as default training episode
254
255    }
256
257    public override double Evaluate(Individual individual, IRandom random) {
258      var trees = individual.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
259
260      if (OptimizeParametersForEpisodes) {
261        int eIdx = 0;
262        double totalNMSE = 0.0;
263        int totalSize = 0;
264        foreach (var episode in TrainingEpisodes) {
265          double[] optTheta;
266          double nmse;
267          OptimizeForEpisodes(trees, random, new[] { episode }, out optTheta, out nmse);
268          individual["OptTheta_" + eIdx] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analysis method
269          eIdx++;
270          totalNMSE += nmse * episode.Size;
271          totalSize += episode.Size;
272        }
273        return totalNMSE / totalSize;
274      } else {
275        double[] optTheta;
276        double nmse;
277        OptimizeForEpisodes(trees, random, TrainingEpisodes, out optTheta, out nmse);
278        individual["OptTheta"] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analysis method
279        return nmse;
280      }
281    }
282
283    private void OptimizeForEpisodes(ISymbolicExpressionTree[] trees, IRandom random, IEnumerable<IntRange> episodes, out double[] optTheta, out double nmse) {
284      var rows = episodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start)).ToArray();
285      var problemData = ProblemData;
286      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
287      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
288      var targetValues = new double[rows.Length, targetVars.Length];
289
290      // collect values of all target variables
291      var colIdx = 0;
292      foreach (var targetVar in targetVars) {
293        int rowIdx = 0;
294        foreach (var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
295          targetValues[rowIdx, colIdx] = value;
296          rowIdx++;
297        }
298        colIdx++;
299      }
300
301      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
302
303      foreach (var tree in trees) {
304        foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
305          nodeIdx.Add(node, nodeIdx.Count);
306        }
307      }
308
309      var theta = nodeIdx.Select(_ => random.NextDouble() * 2.0 - 1.0).ToArray(); // init params randomly from Unif(-1,1)
310
311      optTheta = new double[0];
312      if (theta.Length > 0) {
313        alglib.minlbfgsstate state;
314        alglib.minlbfgsreport report;
315        alglib.minlbfgscreate(Math.Min(theta.Length, 5), theta, out state);
316        alglib.minlbfgssetcond(state, 0.0, 0.0, 0.0, MaximumParameterOptimizationIterations);
317        alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null,
318          new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables }); //TODO: create a type
319        alglib.minlbfgsresults(state, out optTheta, out report);
320
321        /*
322         *
323         *         L-BFGS algorithm results
324
325          INPUT PARAMETERS:
326              State   -   algorithm state
327
328          OUTPUT PARAMETERS:
329              X       -   array[0..N-1], solution
330              Rep     -   optimization report:
331                          * Rep.TerminationType completetion code:
332                              * -7    gradient verification failed.
333                                      See MinLBFGSSetGradientCheck() for more information.
334                              * -2    rounding errors prevent further improvement.
335                                      X contains best point found.
336                              * -1    incorrect parameters were specified
337                              *  1    relative function improvement is no more than
338                                      EpsF.
339                              *  2    relative step is no more than EpsX.
340                              *  4    gradient norm is no more than EpsG
341                              *  5    MaxIts steps was taken
342                              *  7    stopping conditions are too stringent,
343                                      further improvement is impossible
344                          * Rep.IterationsCount contains iterations count
345                          * NFEV countains number of function calculations
346         */
347        if (report.terminationtype < 0) { nmse = 10E6; return; }
348      }
349
350      // perform evaluation for optimal theta to get quality value
351      double[] grad = new double[optTheta.Length];
352      nmse = double.NaN;
353      EvaluateObjectiveAndGradient(optTheta, ref nmse, grad,
354        new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables });
355      if (double.IsNaN(nmse) || double.IsInfinity(nmse)) { nmse = 10E6; return; } // return a large value (TODO: be consistent by using NMSE)
356    }
357
358    private static void EvaluateObjectiveAndGradient(double[] x, ref double f, double[] grad, object obj) {
359      var trees = (ISymbolicExpressionTree[])((object[])obj)[0];
360      var targetVariables = (string[])((object[])obj)[1];
361      var problemData = (IRegressionProblemData)((object[])obj)[2];
362      var nodeIdx = (Dictionary<ISymbolicExpressionTreeNode, int>)((object[])obj)[3];
363      var targetValues = (double[,])((object[])obj)[4];
364      var episodes = (IntRange[])((object[])obj)[5];
365      var numericIntegrationSteps = (int)((object[])obj)[6];
366      var latentVariables = (string[])((object[])obj)[7];
367
368      var predicted = Integrate(
369        trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
370        problemData.Dataset,
371        problemData.AllowedInputVariables.ToArray(),
372        targetVariables,
373        latentVariables,
374        episodes,
375        nodeIdx,
376        x, numericIntegrationSteps).ToArray();
377
378
379      // for normalized MSE = 1/variance(t) * MSE(t, pred)
380      // TODO: Perf. (by standardization of target variables before evaluation of all trees)     
381      var invVar = Enumerable.Range(0, targetVariables.Length)
382        .Select(c => Enumerable.Range(0, targetValues.GetLength(0)).Select(row => targetValues[row, c])) // column vectors
383        .Select(vec => vec.Variance())
384        .Select(v => 1.0 / v)
385        .ToArray();
386
387      // objective function is NMSE
388      f = 0.0;
389      int n = predicted.Length;
390      double invN = 1.0 / n;
391      var g = Vector.Zero;
392      int r = 0;
393      foreach (var y_pred in predicted) {
394        for (int c = 0; c < y_pred.Length; c++) {
395
396          var y_pred_f = y_pred[c].Item1;
397          var y = targetValues[r, c];
398
399          var res = (y - y_pred_f);
400          var ressq = res * res;
401          f += ressq * invN * invVar[c];
402          g += -2.0 * res * y_pred[c].Item2 * invN * invVar[c];
403        }
404        r++;
405      }
406
407      g.CopyTo(grad);
408    }
409
410    public override void Analyze(Individual[] individuals, double[] qualities, ResultCollection results, IRandom random) {
411      base.Analyze(individuals, qualities, results, random);
412
413      if (!results.ContainsKey("Prediction (training)")) {
414        results.Add(new Result("Prediction (training)", typeof(ReadOnlyItemList<DataTable>)));
415      }
416      if (!results.ContainsKey("Prediction (test)")) {
417        results.Add(new Result("Prediction (test)", typeof(ReadOnlyItemList<DataTable>)));
418      }
419      if (!results.ContainsKey("Models")) {
420        results.Add(new Result("Models", typeof(VariableCollection)));
421      }
422
423      var bestIndividualAndQuality = this.GetBestIndividual(individuals, qualities);
424      var trees = bestIndividualAndQuality.Item1.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
425
426      // TODO extract common functionality from Evaluate and Analyze
427      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
428      foreach (var tree in trees) {
429        foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
430          nodeIdx.Add(node, nodeIdx.Count);
431        }
432      }
433      var problemData = ProblemData;
434      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
435      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
436
437      var trainingList = new ItemList<DataTable>();
438
439      if (OptimizeParametersForEpisodes) {
440        var eIdx = 0;
441        var trainingPredictions = new List<Tuple<double, Vector>[][]>();
442        foreach (var episode in TrainingEpisodes) {
443          var episodes = new[] { episode };
444          var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta_" + eIdx]).ToArray(); // see evaluate
445          var trainingPrediction = Integrate(
446                                   trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
447                                   problemData.Dataset,
448                                   problemData.AllowedInputVariables.ToArray(),
449                                   targetVars,
450                                   latentVariables,
451                                   episodes,
452                                   nodeIdx,
453                                   optTheta,
454                                   NumericIntegrationSteps).ToArray();
455          trainingPredictions.Add(trainingPrediction);
456          eIdx++;
457        }
458
459        // only for actual target values
460        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
461        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
462          var targetVar = targetVars[colIdx];
463          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
464          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
465          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPredictions.SelectMany(arr => arr.Select(row => row[colIdx].Item1)).ToArray());
466          trainingDataTable.Rows.Add(actualValuesRow);
467          trainingDataTable.Rows.Add(predictedValuesRow);
468          trainingList.Add(trainingDataTable);
469        }
470        results["Prediction (training)"].Value = trainingList.AsReadOnly();
471
472
473        var models = new VariableCollection();
474
475        foreach (var tup in targetVars.Zip(trees, Tuple.Create)) {
476          var targetVarName = tup.Item1;
477          var tree = tup.Item2;
478
479          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
480          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
481          models.Add(origTreeVar);
482        }
483        results["Models"].Value = models;
484      } else {
485        var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see evaluate
486        var trainingPrediction = Integrate(
487                                   trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
488                                   problemData.Dataset,
489                                   problemData.AllowedInputVariables.ToArray(),
490                                   targetVars,
491                                   latentVariables,
492                                   TrainingEpisodes,
493                                   nodeIdx,
494                                   optTheta,
495                                   NumericIntegrationSteps).ToArray();
496        // only for actual target values
497        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
498        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
499          var targetVar = targetVars[colIdx];
500          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
501          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
502          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPrediction.Select(arr => arr[colIdx].Item1).ToArray());
503          trainingDataTable.Rows.Add(actualValuesRow);
504          trainingDataTable.Rows.Add(predictedValuesRow);
505          trainingList.Add(trainingDataTable);
506        }
507        // TODO: DRY for training and test
508        var testList = new ItemList<DataTable>();
509        var testRows = ProblemData.TestIndices.ToArray();
510        var testPrediction = Integrate(
511         trees,  // we assume trees contain expressions for the change of each target variable over time y'(t)
512         problemData.Dataset,
513         problemData.AllowedInputVariables.ToArray(),
514         targetVars,
515         latentVariables,
516         new IntRange[] { ProblemData.TestPartition },
517         nodeIdx,
518         optTheta,
519         NumericIntegrationSteps).ToArray();
520
521        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
522          var targetVar = targetVars[colIdx];
523          var testDataTable = new DataTable(targetVar + " prediction (test)");
524          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, testRows));
525          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, testPrediction.Select(arr => arr[colIdx].Item1).ToArray());
526          testDataTable.Rows.Add(actualValuesRow);
527          testDataTable.Rows.Add(predictedValuesRow);
528          testList.Add(testDataTable);
529        }
530
531        results["Prediction (training)"].Value = trainingList.AsReadOnly();
532        results["Prediction (test)"].Value = testList.AsReadOnly();
533        #region simplification of models
534        // TODO the dependency of HeuristicLab.Problems.DataAnalysis.Symbolic is not ideal
535        var models = new VariableCollection();    // to store target var names and original version of tree
536
537        foreach (var tup in targetVars.Zip(trees, Tuple.Create)) {
538          var targetVarName = tup.Item1;
539          var tree = tup.Item2;
540
541          // when we reference HeuristicLab.Problems.DataAnalysis.Symbolic we can translate symbols
542          int nextParIdx = 0;
543          var shownTree = new SymbolicExpressionTree(TranslateTreeNode(tree.Root, optTheta, ref nextParIdx));
544
545          // var shownTree = (SymbolicExpressionTree)tree.Clone();
546          // var constantsNodeOrig = tree.IterateNodesPrefix().Where(IsConstantNode);
547          // var constantsNodeShown = shownTree.IterateNodesPrefix().Where(IsConstantNode);
548          //
549          // foreach (var n in constantsNodeOrig.Zip(constantsNodeShown, (original, shown) => new { original, shown })) {
550          //   double constantsVal = optTheta[nodeIdx[n.original]];
551          //
552          //   ConstantTreeNode replacementNode = new ConstantTreeNode(new Constant()) { Value = constantsVal };
553          //
554          //   var parentNode = n.shown.Parent;
555          //   int replacementIndex = parentNode.IndexOfSubtree(n.shown);
556          //   parentNode.RemoveSubtree(replacementIndex);
557          //   parentNode.InsertSubtree(replacementIndex, replacementNode);
558          // }
559
560          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
561          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
562          models.Add(origTreeVar);
563          var simplifiedTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(simplified)");
564          simplifiedTreeVar.Value = TreeSimplifier.Simplify(shownTree);
565          models.Add(simplifiedTreeVar);
566
567        }
568        results["Models"].Value = models;
569        #endregion
570      }
571    }
572
573    private ISymbolicExpressionTreeNode TranslateTreeNode(ISymbolicExpressionTreeNode n, double[] parameterValues, ref int nextParIdx) {
574      ISymbolicExpressionTreeNode translatedNode = null;
575      if (n.Symbol is StartSymbol) {
576        translatedNode = new StartSymbol().CreateTreeNode();
577      } else if (n.Symbol is ProgramRootSymbol) {
578        translatedNode = new ProgramRootSymbol().CreateTreeNode();
579      } else if (n.Symbol.Name == "+") {
580        translatedNode = new Addition().CreateTreeNode();
581      } else if (n.Symbol.Name == "-") {
582        translatedNode = new Subtraction().CreateTreeNode();
583      } else if (n.Symbol.Name == "*") {
584        translatedNode = new Multiplication().CreateTreeNode();
585      } else if (n.Symbol.Name == "%") {
586        translatedNode = new Division().CreateTreeNode();
587      } else if (n.Symbol.Name == "sin") {
588        translatedNode = new Sine().CreateTreeNode();
589      } else if (n.Symbol.Name == "cos") {
590        translatedNode = new Cosine().CreateTreeNode();
591      } else if (IsConstantNode(n)) {
592        var constNode = (ConstantTreeNode)new Constant().CreateTreeNode();
593        constNode.Value = parameterValues[nextParIdx];
594        nextParIdx++;
595        translatedNode = constNode;
596      } else {
597        // assume a variable name
598        var varName = n.Symbol.Name;
599        var varNode = (VariableTreeNode)new Variable().CreateTreeNode();
600        varNode.Weight = 1.0;
601        varNode.VariableName = varName;
602        translatedNode = varNode;
603      }
604      foreach (var child in n.Subtrees) {
605        translatedNode.AddSubtree(TranslateTreeNode(child, parameterValues, ref nextParIdx));
606      }
607      return translatedNode;
608    }
609
610    #region interpretation
611    private static IEnumerable<Tuple<double, Vector>[]> Integrate(
612      ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, string[] latentVariables, IEnumerable<IntRange> episodes,
613      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues, int numericIntegrationSteps = 100) {
614
615      int NUM_STEPS = numericIntegrationSteps;
616      double h = 1.0 / NUM_STEPS;
617
618      foreach (var episode in episodes) {
619        var rows = Enumerable.Range(episode.Start, episode.End - episode.Start);
620        // return first value as stored in the dataset
621        yield return targetVariables
622          .Select(targetVar => Tuple.Create(dataset.GetDoubleValue(targetVar, rows.First()), Vector.Zero))
623          .ToArray();
624
625        // integrate forward starting with known values for the target in t0
626
627        var variableValues = new Dictionary<string, Tuple<double, Vector>>();
628        var t0 = rows.First();
629        foreach (var varName in inputVariables) {
630          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
631        }
632        foreach (var varName in targetVariables) {
633          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
634        }
635        // add value entries for latent variables which are also integrated
636        foreach (var latentVar in latentVariables) {
637          variableValues.Add(latentVar, Tuple.Create(0.0, Vector.Zero)); // we don't have observations for latent variables -> assume zero as starting value
638        }
639        var calculatedVariables = targetVariables.Concat(latentVariables); // TODO: must conincide with the order of trees in the encoding
640
641        foreach (var t in rows.Skip(1)) {
642          for (int step = 0; step < NUM_STEPS; step++) {
643            var deltaValues = new Dictionary<string, Tuple<double, Vector>>();
644            foreach (var tup in trees.Zip(calculatedVariables, Tuple.Create)) {
645              var tree = tup.Item1;
646              var targetVarName = tup.Item2;
647              // skip programRoot and startSymbol
648              var res = InterpretRec(tree.Root.GetSubtree(0).GetSubtree(0), variableValues, nodeIdx, parameterValues);
649              deltaValues.Add(targetVarName, res);
650            }
651
652            // update variableValues for next step
653            foreach (var kvp in deltaValues) {
654              var oldVal = variableValues[kvp.Key];
655              variableValues[kvp.Key] = Tuple.Create(
656                oldVal.Item1 + h * kvp.Value.Item1,
657                oldVal.Item2 + h * kvp.Value.Item2
658              );
659            }
660          }
661
662          // only return the target variables for calculation of errors
663          yield return targetVariables
664            .Select(targetVar => variableValues[targetVar])
665            .ToArray();
666
667          // update for next time step
668          foreach (var varName in inputVariables) {
669            variableValues[varName] = Tuple.Create(dataset.GetDoubleValue(varName, t), Vector.Zero);
670          }
671        }
672      }
673    }
674
675    private static Tuple<double, Vector> InterpretRec(
676      ISymbolicExpressionTreeNode node,
677      Dictionary<string, Tuple<double, Vector>> variableValues,
678      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx,
679      double[] parameterValues
680        ) {
681
682      switch (node.Symbol.Name) {
683        case "+": {
684            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues); // TODO capture all parameters into a state type for interpretation
685            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
686
687            return Tuple.Create(l.Item1 + r.Item1, l.Item2 + r.Item2);
688          }
689        case "*": {
690            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
691            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
692
693            return Tuple.Create(l.Item1 * r.Item1, l.Item2 * r.Item1 + l.Item1 * r.Item2);
694          }
695
696        case "-": {
697            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
698            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
699
700            return Tuple.Create(l.Item1 - r.Item1, l.Item2 - r.Item2);
701          }
702        case "%": {
703            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
704            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
705
706            // protected division
707            if (r.Item1.IsAlmost(0.0)) {
708              return Tuple.Create(0.0, Vector.Zero);
709            } else {
710              return Tuple.Create(
711                l.Item1 / r.Item1,
712                l.Item1 * -1.0 / (r.Item1 * r.Item1) * r.Item2 + 1.0 / r.Item1 * l.Item2 // (f/g)' = f * (1/g)' + 1/g * f' = f * -1/g² * g' + 1/g * f'
713                );
714            }
715          }
716        case "sin": {
717            var x = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
718            return Tuple.Create(
719              Math.Sin(x.Item1),
720              Vector.Cos(x.Item2) * x.Item2
721            );
722          }
723        case "cos": {
724            var x = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
725            return Tuple.Create(
726              Math.Cos(x.Item1),
727              -Vector.Sin(x.Item2) * x.Item2
728            );
729          }
730        default: {
731            // distinguish other cases
732            if (IsConstantNode(node)) {
733              var vArr = new double[parameterValues.Length]; // backing array for vector
734              vArr[nodeIdx[node]] = 1.0;
735              var g = new Vector(vArr);
736              return Tuple.Create(parameterValues[nodeIdx[node]], g);
737            } else {
738              // assume a variable name
739              var varName = node.Symbol.Name;
740              return variableValues[varName];
741            }
742          }
743      }
744    }
745    #endregion
746
747    #region events
748    /*
749     * Dependencies between parameters:
750     *
751     * ProblemData
752     *    |
753     *    V
754     * TargetVariables   FunctionSet    MaximumLength    NumberOfLatentVariables
755     *               |   |                 |                   |
756     *               V   V                 |                   |
757     *             Grammar <---------------+-------------------
758     *                |
759     *                V
760     *            Encoding
761     */
762    private void RegisterEventHandlers() {
763      ProblemDataParameter.ValueChanged += ProblemDataParameter_ValueChanged;
764      if (ProblemDataParameter.Value != null) ProblemDataParameter.Value.Changed += ProblemData_Changed;
765
766      TargetVariablesParameter.ValueChanged += TargetVariablesParameter_ValueChanged;
767      if (TargetVariablesParameter.Value != null) TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
768
769      FunctionSetParameter.ValueChanged += FunctionSetParameter_ValueChanged;
770      if (FunctionSetParameter.Value != null) FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
771
772      MaximumLengthParameter.Value.ValueChanged += MaximumLengthChanged;
773
774      NumberOfLatentVariablesParameter.Value.ValueChanged += NumLatentVariablesChanged;
775    }
776
777    private void NumLatentVariablesChanged(object sender, EventArgs e) {
778      UpdateGrammarAndEncoding();
779    }
780
781    private void MaximumLengthChanged(object sender, EventArgs e) {
782      UpdateGrammarAndEncoding();
783    }
784
785    private void FunctionSetParameter_ValueChanged(object sender, EventArgs e) {
786      FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
787    }
788
789    private void CheckedFunctionsChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
790      UpdateGrammarAndEncoding();
791    }
792
793    private void TargetVariablesParameter_ValueChanged(object sender, EventArgs e) {
794      TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
795    }
796
797    private void CheckedTargetVariablesChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
798      UpdateGrammarAndEncoding();
799    }
800
801    private void ProblemDataParameter_ValueChanged(object sender, EventArgs e) {
802      ProblemDataParameter.Value.Changed += ProblemData_Changed;
803      OnProblemDataChanged();
804      OnReset();
805    }
806
807    private void ProblemData_Changed(object sender, EventArgs e) {
808      OnProblemDataChanged();
809      OnReset();
810    }
811
812    private void OnProblemDataChanged() {
813      UpdateTargetVariables();        // implicitly updates other dependent parameters
814      var handler = ProblemDataChanged;
815      if (handler != null) handler(this, EventArgs.Empty);
816    }
817
818    #endregion
819
820    #region  helper
821
822    private void InitAllParameters() {
823      UpdateTargetVariables(); // implicitly updates the grammar and the encoding     
824    }
825
826    private ReadOnlyCheckedItemCollection<StringValue> CreateFunctionSet() {
827      var l = new CheckedItemCollection<StringValue>();
828      l.Add(new StringValue("+").AsReadOnly());
829      l.Add(new StringValue("*").AsReadOnly());
830      l.Add(new StringValue("%").AsReadOnly());
831      l.Add(new StringValue("-").AsReadOnly());
832      l.Add(new StringValue("sin").AsReadOnly());
833      l.Add(new StringValue("cos").AsReadOnly());
834      return l.AsReadOnly();
835    }
836
837    private static bool IsConstantNode(ISymbolicExpressionTreeNode n) {
838      return n.Symbol.Name.StartsWith("θ");
839    }
840    private static bool IsLatentVariableNode(ISymbolicExpressionTreeNode n) {
841      return n.Symbol.Name.StartsWith("λ");
842    }
843
844
845    private void UpdateTargetVariables() {
846      var currentlySelectedVariables = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
847
848      var newVariablesList = new CheckedItemCollection<StringValue>(ProblemData.Dataset.VariableNames.Select(str => new StringValue(str).AsReadOnly()).ToArray()).AsReadOnly();
849      var matchingItems = newVariablesList.Where(item => currentlySelectedVariables.Contains(item.Value)).ToArray();
850      foreach (var matchingItem in matchingItems) {
851        newVariablesList.SetItemCheckedState(matchingItem, true);
852      }
853      TargetVariablesParameter.Value = newVariablesList;
854    }
855
856    private void UpdateGrammarAndEncoding() {
857      var encoding = new MultiEncoding();
858      var g = CreateGrammar();
859      foreach (var targetVar in TargetVariables.CheckedItems) {
860        encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar + "_tree", g, MaximumLength, MaximumLength)); // only limit by length
861      }
862      for (int i = 1; i <= NumberOfLatentVariables; i++) {
863        encoding = encoding.Add(new SymbolicExpressionTreeEncoding("λ" + i + "_tree", g, MaximumLength, MaximumLength));
864      }
865      Encoding = encoding;
866    }
867
868    private ISymbolicExpressionGrammar CreateGrammar() {
869      // whenever ProblemData is changed we create a new grammar with the necessary symbols
870      var g = new SimpleSymbolicExpressionGrammar();
871      g.AddSymbols(FunctionSet.CheckedItems.Select(i => i.Value).ToArray(), 2, 2);
872
873      // TODO
874      //g.AddSymbols(new[] {
875      //  "exp",
876      //  "log", // log( <expr> ) // TODO: init a theta to ensure the value is always positive
877      //  "exp_minus" // exp((-1) * <expr>
878      //}, 1, 1);
879
880      foreach (var variableName in ProblemData.AllowedInputVariables.Union(TargetVariables.CheckedItems.Select(i => i.Value)))
881        g.AddTerminalSymbol(variableName);
882
883      // generate symbols for numeric parameters for which the value is optimized using AutoDiff
884      // we generate multiple symbols to balance the probability for selecting a numeric parameter in the generation of random trees
885      var numericConstantsFactor = 2.0;
886      for (int i = 0; i < numericConstantsFactor * (ProblemData.AllowedInputVariables.Count() + TargetVariables.CheckedItems.Count()); i++) {
887        g.AddTerminalSymbol("θ" + i); // numeric parameter for which the value is optimized using AutoDiff
888      }
889
890      // generate symbols for latent variables
891      for (int i = 1; i <= NumberOfLatentVariables; i++) {
892        g.AddTerminalSymbol("λ" + i); // numeric parameter for which the value is optimized using AutoDiff
893      }
894
895      return g;
896    }
897
898    #endregion
899
900    #region Import & Export
901    public void Load(IRegressionProblemData data) {
902      Name = data.Name;
903      Description = data.Description;
904      ProblemData = data;
905    }
906
907    public IRegressionProblemData Export() {
908      return ProblemData;
909    }
910    #endregion
911
912  }
913}
Note: See TracBrowser for help on using the repository browser.