[15964] | 1 | #region License Information
|
---|
| 2 | /* HeuristicLab
|
---|
| 3 | * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
| 4 | *
|
---|
| 5 | * This file is part of HeuristicLab.
|
---|
| 6 | *
|
---|
| 7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
| 19 | */
|
---|
| 20 | #endregion
|
---|
| 21 |
|
---|
| 22 | using System;
|
---|
| 23 | using System.Collections.Generic;
|
---|
| 24 | using System.Diagnostics;
|
---|
| 25 | using System.Linq;
|
---|
[15968] | 26 | using HeuristicLab.Analysis;
|
---|
| 27 | using HeuristicLab.Collections;
|
---|
[15964] | 28 | using HeuristicLab.Common;
|
---|
| 29 | using HeuristicLab.Core;
|
---|
[15968] | 30 | using HeuristicLab.Data;
|
---|
[15964] | 31 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
|
---|
[15968] | 32 | using HeuristicLab.Optimization;
|
---|
[15964] | 33 | using HeuristicLab.Parameters;
|
---|
| 34 | using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
|
---|
| 35 | using HeuristicLab.Problems.DataAnalysis;
|
---|
| 36 | using HeuristicLab.Problems.Instances;
|
---|
| 37 |
|
---|
| 38 | namespace HeuristicLab.Problems.DynamicalSystemsModelling {
|
---|
| 39 | public class Vector {
|
---|
| 40 | public readonly static Vector Zero = new Vector(new double[0]);
|
---|
| 41 |
|
---|
| 42 | public static Vector operator +(Vector a, Vector b) {
|
---|
| 43 | if (a == Zero) return b;
|
---|
| 44 | if (b == Zero) return a;
|
---|
| 45 | Debug.Assert(a.arr.Length == b.arr.Length);
|
---|
| 46 | var res = new double[a.arr.Length];
|
---|
| 47 | for (int i = 0; i < res.Length; i++)
|
---|
| 48 | res[i] = a.arr[i] + b.arr[i];
|
---|
| 49 | return new Vector(res);
|
---|
| 50 | }
|
---|
| 51 | public static Vector operator -(Vector a, Vector b) {
|
---|
| 52 | if (b == Zero) return a;
|
---|
| 53 | if (a == Zero) return -b;
|
---|
| 54 | Debug.Assert(a.arr.Length == b.arr.Length);
|
---|
| 55 | var res = new double[a.arr.Length];
|
---|
| 56 | for (int i = 0; i < res.Length; i++)
|
---|
| 57 | res[i] = a.arr[i] - b.arr[i];
|
---|
| 58 | return new Vector(res);
|
---|
| 59 | }
|
---|
| 60 | public static Vector operator -(Vector v) {
|
---|
| 61 | if (v == Zero) return Zero;
|
---|
| 62 | for (int i = 0; i < v.arr.Length; i++)
|
---|
| 63 | v.arr[i] = -v.arr[i];
|
---|
| 64 | return v;
|
---|
| 65 | }
|
---|
| 66 |
|
---|
| 67 | public static Vector operator *(double s, Vector v) {
|
---|
| 68 | if (v == Zero) return Zero;
|
---|
| 69 | if (s == 0.0) return Zero;
|
---|
| 70 | var res = new double[v.arr.Length];
|
---|
| 71 | for (int i = 0; i < res.Length; i++)
|
---|
| 72 | res[i] = s * v.arr[i];
|
---|
| 73 | return new Vector(res);
|
---|
| 74 | }
|
---|
| 75 | public static Vector operator *(Vector v, double s) {
|
---|
| 76 | return s * v;
|
---|
| 77 | }
|
---|
| 78 | public static Vector operator /(double s, Vector v) {
|
---|
| 79 | if (s == 0.0) return Zero;
|
---|
| 80 | if (v == Zero) throw new ArgumentException("Division by zero vector");
|
---|
| 81 | var res = new double[v.arr.Length];
|
---|
| 82 | for (int i = 0; i < res.Length; i++)
|
---|
| 83 | res[i] = 1.0 / v.arr[i];
|
---|
| 84 | return new Vector(res);
|
---|
| 85 | }
|
---|
| 86 | public static Vector operator /(Vector v, double s) {
|
---|
| 87 | return v * 1.0 / s;
|
---|
| 88 | }
|
---|
| 89 |
|
---|
| 90 |
|
---|
| 91 | private readonly double[] arr; // backing array;
|
---|
| 92 |
|
---|
| 93 | public Vector(double[] v) {
|
---|
| 94 | this.arr = v;
|
---|
| 95 | }
|
---|
| 96 |
|
---|
| 97 | public void CopyTo(double[] target) {
|
---|
| 98 | Debug.Assert(arr.Length <= target.Length);
|
---|
| 99 | Array.Copy(arr, target, arr.Length);
|
---|
| 100 | }
|
---|
| 101 | }
|
---|
| 102 |
|
---|
| 103 | [Item("Dynamical Systems Modelling Problem", "TODO")]
|
---|
| 104 | [Creatable(CreatableAttribute.Categories.GeneticProgrammingProblems, Priority = 900)]
|
---|
| 105 | [StorableClass]
|
---|
[15968] | 106 | public sealed class Problem : SingleObjectiveBasicProblem<MultiEncoding>, IRegressionProblem, IProblemInstanceConsumer<IRegressionProblemData>, IProblemInstanceExporter<IRegressionProblemData> {
|
---|
[15964] | 107 |
|
---|
| 108 | #region parameter names
|
---|
[15968] | 109 | private const string ProblemDataParameterName = "Data";
|
---|
| 110 | private const string TargetVariablesParameterName = "Target variables";
|
---|
| 111 | private const string FunctionSetParameterName = "Function set";
|
---|
| 112 | private const string MaximumLengthParameterName = "Size limit";
|
---|
| 113 | private const string MaximumParameterOptimizationIterationsParameterName = "Max. parameter optimization iterations";
|
---|
[15970] | 114 | private const string NumberOfLatentVariablesParameterName = "Number of latent variables";
|
---|
| 115 | private const string NumericIntegrationStepsParameterName = "Steps for numeric integration";
|
---|
[15964] | 116 | #endregion
|
---|
| 117 |
|
---|
| 118 | #region Parameter Properties
|
---|
| 119 | IParameter IDataAnalysisProblem.ProblemDataParameter { get { return ProblemDataParameter; } }
|
---|
| 120 |
|
---|
| 121 | public IValueParameter<IRegressionProblemData> ProblemDataParameter {
|
---|
| 122 | get { return (IValueParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
|
---|
| 123 | }
|
---|
[15968] | 124 | public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> TargetVariablesParameter {
|
---|
| 125 | get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[TargetVariablesParameterName]; }
|
---|
| 126 | }
|
---|
| 127 | public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> FunctionSetParameter {
|
---|
| 128 | get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[FunctionSetParameterName]; }
|
---|
| 129 | }
|
---|
| 130 | public IFixedValueParameter<IntValue> MaximumLengthParameter {
|
---|
| 131 | get { return (IFixedValueParameter<IntValue>)Parameters[MaximumLengthParameterName]; }
|
---|
| 132 | }
|
---|
| 133 | public IFixedValueParameter<IntValue> MaximumParameterOptimizationIterationsParameter {
|
---|
| 134 | get { return (IFixedValueParameter<IntValue>)Parameters[MaximumParameterOptimizationIterationsParameterName]; }
|
---|
| 135 | }
|
---|
[15970] | 136 | public IFixedValueParameter<IntValue> NumberOfLatentVariablesParameter {
|
---|
| 137 | get { return (IFixedValueParameter<IntValue>)Parameters[NumberOfLatentVariablesParameterName]; }
|
---|
| 138 | }
|
---|
| 139 | public IFixedValueParameter<IntValue> NumericIntegrationStepsParameter {
|
---|
| 140 | get { return (IFixedValueParameter<IntValue>)Parameters[NumericIntegrationStepsParameterName]; }
|
---|
| 141 | }
|
---|
[15964] | 142 | #endregion
|
---|
| 143 |
|
---|
| 144 | #region Properties
|
---|
| 145 | public IRegressionProblemData ProblemData {
|
---|
| 146 | get { return ProblemDataParameter.Value; }
|
---|
| 147 | set { ProblemDataParameter.Value = value; }
|
---|
| 148 | }
|
---|
| 149 | IDataAnalysisProblemData IDataAnalysisProblem.ProblemData { get { return ProblemData; } }
|
---|
| 150 |
|
---|
[15968] | 151 | public ReadOnlyCheckedItemCollection<StringValue> TargetVariables {
|
---|
| 152 | get { return TargetVariablesParameter.Value; }
|
---|
| 153 | }
|
---|
| 154 |
|
---|
| 155 | public ReadOnlyCheckedItemCollection<StringValue> FunctionSet {
|
---|
| 156 | get { return FunctionSetParameter.Value; }
|
---|
| 157 | }
|
---|
| 158 |
|
---|
| 159 | public int MaximumLength {
|
---|
| 160 | get { return MaximumLengthParameter.Value.Value; }
|
---|
| 161 | }
|
---|
| 162 | public int MaximumParameterOptimizationIterations {
|
---|
| 163 | get { return MaximumParameterOptimizationIterationsParameter.Value.Value; }
|
---|
| 164 | }
|
---|
[15970] | 165 | public int NumberOfLatentVariables {
|
---|
| 166 | get { return NumberOfLatentVariablesParameter.Value.Value; }
|
---|
| 167 | }
|
---|
| 168 | public int NumericIntegrationSteps {
|
---|
| 169 | get { return NumericIntegrationStepsParameter.Value.Value; }
|
---|
| 170 | }
|
---|
| 171 |
|
---|
[15968] | 172 | #endregion
|
---|
| 173 |
|
---|
[15964] | 174 | public event EventHandler ProblemDataChanged;
|
---|
| 175 |
|
---|
| 176 | public override bool Maximization {
|
---|
| 177 | get { return false; } // we minimize NMSE
|
---|
| 178 | }
|
---|
| 179 |
|
---|
| 180 | #region item cloning and persistence
|
---|
| 181 | // persistence
|
---|
| 182 | [StorableConstructor]
|
---|
| 183 | private Problem(bool deserializing) : base(deserializing) { }
|
---|
| 184 | [StorableHook(HookType.AfterDeserialization)]
|
---|
| 185 | private void AfterDeserialization() {
|
---|
| 186 | RegisterEventHandlers();
|
---|
| 187 | }
|
---|
| 188 |
|
---|
| 189 | // cloning
|
---|
| 190 | private Problem(Problem original, Cloner cloner)
|
---|
| 191 | : base(original, cloner) {
|
---|
| 192 | RegisterEventHandlers();
|
---|
| 193 | }
|
---|
| 194 | public override IDeepCloneable Clone(Cloner cloner) { return new Problem(this, cloner); }
|
---|
| 195 | #endregion
|
---|
| 196 |
|
---|
| 197 | public Problem()
|
---|
| 198 | : base() {
|
---|
[15968] | 199 | var targetVariables = new CheckedItemCollection<StringValue>().AsReadOnly(); // HACK: it would be better to provide a new class derived from IDataAnalysisProblem
|
---|
| 200 | var functions = CreateFunctionSet();
|
---|
[15970] | 201 | Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system. Use CSV import functionality to import data.", new RegressionProblemData()));
|
---|
[15968] | 202 | Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(TargetVariablesParameterName, "Target variables (overrides setting in ProblemData)", targetVariables));
|
---|
| 203 | Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(FunctionSetParameterName, "The list of allowed functions", functions));
|
---|
[15970] | 204 | Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximally allowed length of each expression. Set to a small value (5 - 25). Default = 10", new IntValue(10)));
|
---|
| 205 | Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for optimization of parameters (using L-BFGS). More iterations makes the algorithm slower, fewer iterations might prevent convergence in the optimization scheme. Default = 100", new IntValue(100)));
|
---|
| 206 | Parameters.Add(new FixedValueParameter<IntValue>(NumberOfLatentVariablesParameterName, "Latent variables (unobserved variables) allow us to produce expressions which are integrated up and can be used in other expressions. They are handled similarly to target variables in forward simulation / integration. The difference to target variables is that there are no data to which the calculated values of latent variables are compared. Set to a small value (0 .. 5) as necessary (default = 0)", new IntValue(0)));
|
---|
| 207 | Parameters.Add(new FixedValueParameter<IntValue>(NumericIntegrationStepsParameterName, "Number of steps in the numeric integration that are taken from one row to the next (set to 1 to 100). More steps makes the algorithm slower, less steps worsens the accuracy of the numeric integration scheme.", new IntValue(10)));
|
---|
[15964] | 208 |
|
---|
| 209 | RegisterEventHandlers();
|
---|
[15968] | 210 | InitAllParameters();
|
---|
[15964] | 211 | }
|
---|
| 212 |
|
---|
| 213 |
|
---|
[15968] | 214 | public override double Evaluate(Individual individual, IRandom random) {
|
---|
| 215 | var trees = individual.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
|
---|
| 216 |
|
---|
[15964] | 217 | var problemData = ProblemData;
|
---|
| 218 | var rows = ProblemData.TrainingIndices.ToArray();
|
---|
[15968] | 219 | var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
|
---|
[15970] | 220 | var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
|
---|
| 221 | var targetValues = new double[rows.Length, targetVars.Length];
|
---|
| 222 |
|
---|
[15968] | 223 | // collect values of all target variables
|
---|
| 224 | var colIdx = 0;
|
---|
[15970] | 225 | foreach (var targetVar in targetVars) {
|
---|
[15968] | 226 | int rowIdx = 0;
|
---|
[15970] | 227 | foreach (var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
|
---|
[15968] | 228 | targetValues[rowIdx, colIdx] = value;
|
---|
| 229 | rowIdx++;
|
---|
| 230 | }
|
---|
| 231 | colIdx++;
|
---|
| 232 | }
|
---|
[15964] | 233 |
|
---|
| 234 | var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
|
---|
[15968] | 235 |
|
---|
| 236 | foreach (var tree in trees) {
|
---|
| 237 | foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
|
---|
| 238 | nodeIdx.Add(node, nodeIdx.Count);
|
---|
| 239 | }
|
---|
[15964] | 240 | }
|
---|
| 241 |
|
---|
| 242 | var theta = nodeIdx.Select(_ => random.NextDouble() * 2.0 - 1.0).ToArray(); // init params randomly from Unif(-1,1)
|
---|
| 243 |
|
---|
| 244 | double[] optTheta = new double[0];
|
---|
| 245 | if (theta.Length > 0) {
|
---|
| 246 | alglib.minlbfgsstate state;
|
---|
| 247 | alglib.minlbfgsreport report;
|
---|
| 248 | alglib.minlbfgscreate(Math.Min(theta.Length, 5), theta, out state);
|
---|
[15968] | 249 | alglib.minlbfgssetcond(state, 0.0, 0.0, 0.0, MaximumParameterOptimizationIterations);
|
---|
[15970] | 250 | alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null,
|
---|
| 251 | new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows, NumericIntegrationSteps, latentVariables }); //TODO: create a type
|
---|
[15964] | 252 | alglib.minlbfgsresults(state, out optTheta, out report);
|
---|
| 253 |
|
---|
| 254 | /*
|
---|
| 255 | *
|
---|
| 256 | * L-BFGS algorithm results
|
---|
| 257 |
|
---|
| 258 | INPUT PARAMETERS:
|
---|
| 259 | State - algorithm state
|
---|
| 260 |
|
---|
| 261 | OUTPUT PARAMETERS:
|
---|
| 262 | X - array[0..N-1], solution
|
---|
| 263 | Rep - optimization report:
|
---|
| 264 | * Rep.TerminationType completetion code:
|
---|
| 265 | * -7 gradient verification failed.
|
---|
| 266 | See MinLBFGSSetGradientCheck() for more information.
|
---|
| 267 | * -2 rounding errors prevent further improvement.
|
---|
| 268 | X contains best point found.
|
---|
| 269 | * -1 incorrect parameters were specified
|
---|
| 270 | * 1 relative function improvement is no more than
|
---|
| 271 | EpsF.
|
---|
| 272 | * 2 relative step is no more than EpsX.
|
---|
| 273 | * 4 gradient norm is no more than EpsG
|
---|
| 274 | * 5 MaxIts steps was taken
|
---|
| 275 | * 7 stopping conditions are too stringent,
|
---|
| 276 | further improvement is impossible
|
---|
| 277 | * Rep.IterationsCount contains iterations count
|
---|
| 278 | * NFEV countains number of function calculations
|
---|
| 279 | */
|
---|
| 280 | if (report.terminationtype < 0) return double.MaxValue;
|
---|
| 281 | }
|
---|
| 282 |
|
---|
| 283 | // perform evaluation for optimal theta to get quality value
|
---|
| 284 | double[] grad = new double[optTheta.Length];
|
---|
| 285 | double optQuality = double.NaN;
|
---|
[15970] | 286 | EvaluateObjectiveAndGradient(optTheta, ref optQuality, grad,
|
---|
| 287 | new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows, NumericIntegrationSteps, latentVariables });
|
---|
[15964] | 288 | if (double.IsNaN(optQuality) || double.IsInfinity(optQuality)) return 10E6; // return a large value (TODO: be consistent by using NMSE)
|
---|
[15968] | 289 |
|
---|
| 290 | individual["OptTheta"] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analysis method
|
---|
[15964] | 291 | return optQuality;
|
---|
| 292 | }
|
---|
| 293 |
|
---|
| 294 | private static void EvaluateObjectiveAndGradient(double[] x, ref double f, double[] grad, object obj) {
|
---|
[15968] | 295 | var trees = (ISymbolicExpressionTree[])((object[])obj)[0];
|
---|
| 296 | var targetVariables = (string[])((object[])obj)[1];
|
---|
| 297 | var problemData = (IRegressionProblemData)((object[])obj)[2];
|
---|
| 298 | var nodeIdx = (Dictionary<ISymbolicExpressionTreeNode, int>)((object[])obj)[3];
|
---|
| 299 | var targetValues = (double[,])((object[])obj)[4];
|
---|
| 300 | var rows = (int[])((object[])obj)[5];
|
---|
[15970] | 301 | var numericIntegrationSteps = (int)((object[])obj)[6];
|
---|
| 302 | var latentVariables = (string[])((object[])obj)[7];
|
---|
[15964] | 303 |
|
---|
| 304 | var predicted = Integrate(
|
---|
[15968] | 305 | trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
|
---|
[15964] | 306 | problemData.Dataset,
|
---|
| 307 | problemData.AllowedInputVariables.ToArray(),
|
---|
[15968] | 308 | targetVariables,
|
---|
[15970] | 309 | latentVariables,
|
---|
[15968] | 310 | rows,
|
---|
| 311 | nodeIdx, // TODO: is it Ok to use rows here ?
|
---|
[15970] | 312 | x, numericIntegrationSteps).ToArray();
|
---|
[15964] | 313 |
|
---|
[15968] | 314 |
|
---|
| 315 | // for normalized MSE = 1/variance(t) * MSE(t, pred)
|
---|
| 316 | var invVar = Enumerable.Range(0, targetVariables.Length)
|
---|
| 317 | .Select(c => rows.Select(row => targetValues[row, c])) // colums vectors
|
---|
| 318 | .Select(vec => vec.Variance())
|
---|
| 319 | .Select(v => 1.0 / v)
|
---|
| 320 | .ToArray();
|
---|
| 321 |
|
---|
| 322 | // objective function is NMSE
|
---|
[15964] | 323 | f = 0.0;
|
---|
| 324 | int n = predicted.Length;
|
---|
| 325 | double invN = 1.0 / n;
|
---|
| 326 | var g = Vector.Zero;
|
---|
[15968] | 327 | int r = 0;
|
---|
| 328 | foreach (var y_pred in predicted) {
|
---|
| 329 | // TODO NMSE to put the same weight on each target regardless of the value range;
|
---|
[15970] | 330 | for (int c = 0; c < y_pred.Length; c++) {
|
---|
| 331 |
|
---|
[15968] | 332 | var y_pred_f = y_pred[c].Item1;
|
---|
[15970] | 333 | var y = targetValues[r, c];
|
---|
[15964] | 334 |
|
---|
[15968] | 335 | var res = (y - y_pred_f);
|
---|
| 336 | var ressq = res * res;
|
---|
| 337 | f += ressq * invN * invVar[c];
|
---|
| 338 | g += -2.0 * res * y_pred[c].Item2 * invN * invVar[c];
|
---|
| 339 | }
|
---|
| 340 | r++;
|
---|
[15964] | 341 | }
|
---|
| 342 |
|
---|
| 343 | g.CopyTo(grad);
|
---|
| 344 | }
|
---|
| 345 |
|
---|
[15968] | 346 | public override void Analyze(Individual[] individuals, double[] qualities, ResultCollection results, IRandom random) {
|
---|
| 347 | base.Analyze(individuals, qualities, results, random);
|
---|
[15964] | 348 |
|
---|
[15968] | 349 | if (!results.ContainsKey("Prediction (training)")) {
|
---|
| 350 | results.Add(new Result("Prediction (training)", typeof(ReadOnlyItemList<DataTable>)));
|
---|
| 351 | }
|
---|
| 352 | if (!results.ContainsKey("Prediction (test)")) {
|
---|
| 353 | results.Add(new Result("Prediction (test)", typeof(ReadOnlyItemList<DataTable>)));
|
---|
| 354 | }
|
---|
| 355 | if (!results.ContainsKey("Models")) {
|
---|
| 356 | results.Add(new Result("Models", typeof(ReadOnlyItemList<ISymbolicExpressionTree>)));
|
---|
| 357 | }
|
---|
| 358 |
|
---|
| 359 | // TODO extract common functionality from Evaluate and Analyze
|
---|
| 360 | var bestIndividualAndQuality = this.GetBestIndividual(individuals, qualities);
|
---|
[15970] | 361 | var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see evaluate
|
---|
[15968] | 362 | var trees = bestIndividualAndQuality.Item1.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
|
---|
| 363 | var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
|
---|
| 364 |
|
---|
| 365 |
|
---|
| 366 | foreach (var tree in trees) {
|
---|
| 367 | foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
|
---|
| 368 | nodeIdx.Add(node, nodeIdx.Count);
|
---|
| 369 | }
|
---|
| 370 | }
|
---|
| 371 | var problemData = ProblemData;
|
---|
| 372 | var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
|
---|
[15970] | 373 | var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
|
---|
[15968] | 374 |
|
---|
| 375 | var trainingList = new ItemList<DataTable>();
|
---|
| 376 | var trainingRows = ProblemData.TrainingIndices.ToArray();
|
---|
| 377 | var trainingPrediction = Integrate(
|
---|
| 378 | trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
|
---|
| 379 | problemData.Dataset,
|
---|
| 380 | problemData.AllowedInputVariables.ToArray(),
|
---|
| 381 | targetVars,
|
---|
[15970] | 382 | latentVariables,
|
---|
[15968] | 383 | trainingRows,
|
---|
| 384 | nodeIdx,
|
---|
[15970] | 385 | optTheta,
|
---|
| 386 | NumericIntegrationSteps).ToArray();
|
---|
[15968] | 387 |
|
---|
| 388 | for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
|
---|
| 389 | var targetVar = targetVars[colIdx];
|
---|
[15970] | 390 | var trainingDataTable = new DataTable(targetVar + " prediction (training)");
|
---|
[15968] | 391 | var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
|
---|
| 392 | var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPrediction.Select(arr => arr[colIdx].Item1).ToArray());
|
---|
| 393 | trainingDataTable.Rows.Add(actualValuesRow);
|
---|
| 394 | trainingDataTable.Rows.Add(predictedValuesRow);
|
---|
| 395 | trainingList.Add(trainingDataTable);
|
---|
| 396 | }
|
---|
| 397 |
|
---|
| 398 | // TODO: DRY for training and test
|
---|
| 399 | var testList = new ItemList<DataTable>();
|
---|
| 400 | var testRows = ProblemData.TestIndices.ToArray();
|
---|
| 401 | var testPrediction = Integrate(
|
---|
| 402 | trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
|
---|
| 403 | problemData.Dataset,
|
---|
| 404 | problemData.AllowedInputVariables.ToArray(),
|
---|
| 405 | targetVars,
|
---|
[15970] | 406 | latentVariables,
|
---|
[15968] | 407 | testRows,
|
---|
| 408 | nodeIdx,
|
---|
[15970] | 409 | optTheta,
|
---|
| 410 | NumericIntegrationSteps).ToArray();
|
---|
[15968] | 411 |
|
---|
| 412 | for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
|
---|
| 413 | var targetVar = targetVars[colIdx];
|
---|
| 414 | var testDataTable = new DataTable(targetVar + " prediction (test)");
|
---|
| 415 | var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, testRows));
|
---|
| 416 | var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, testPrediction.Select(arr => arr[colIdx].Item1).ToArray());
|
---|
| 417 | testDataTable.Rows.Add(actualValuesRow);
|
---|
| 418 | testDataTable.Rows.Add(predictedValuesRow);
|
---|
| 419 | testList.Add(testDataTable);
|
---|
| 420 | }
|
---|
| 421 |
|
---|
| 422 | results["Prediction (training)"].Value = trainingList.AsReadOnly();
|
---|
| 423 | results["Prediction (test)"].Value = testList.AsReadOnly();
|
---|
| 424 | results["Models"].Value = new ItemList<ISymbolicExpressionTree>(trees).AsReadOnly();
|
---|
| 425 | }
|
---|
| 426 |
|
---|
| 427 |
|
---|
| 428 | #region interpretation
|
---|
| 429 | private static IEnumerable<Tuple<double, Vector>[]> Integrate(
|
---|
[15970] | 430 | ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, string[] latentVariables, IEnumerable<int> rows,
|
---|
| 431 | Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues, int numericIntegrationSteps = 100) {
|
---|
[15964] | 432 |
|
---|
[15970] | 433 | int NUM_STEPS = numericIntegrationSteps ;
|
---|
[15964] | 434 | double h = 1.0 / NUM_STEPS;
|
---|
| 435 |
|
---|
| 436 | // return first value as stored in the dataset
|
---|
[15968] | 437 | yield return targetVariables
|
---|
| 438 | .Select(targetVar => Tuple.Create(dataset.GetDoubleValue(targetVar, rows.First()), Vector.Zero))
|
---|
| 439 | .ToArray();
|
---|
| 440 |
|
---|
[15964] | 441 | // integrate forward starting with known values for the target in t0
|
---|
| 442 |
|
---|
| 443 | var variableValues = new Dictionary<string, Tuple<double, Vector>>();
|
---|
| 444 | var t0 = rows.First();
|
---|
| 445 | foreach (var varName in inputVariables) {
|
---|
| 446 | variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
|
---|
| 447 | }
|
---|
| 448 | foreach (var varName in targetVariables) {
|
---|
| 449 | variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
|
---|
| 450 | }
|
---|
[15970] | 451 | // add value entries for latent variables which are also integrated
|
---|
| 452 | foreach(var latentVar in latentVariables) {
|
---|
| 453 | variableValues.Add(latentVar, Tuple.Create(0.0, Vector.Zero)); // we don't have observations for latent variables -> assume zero as starting value
|
---|
| 454 | }
|
---|
| 455 | var calculatedVariables = targetVariables.Concat(latentVariables); // TODO: must conincide with the order of trees in the encoding
|
---|
[15964] | 456 |
|
---|
| 457 | foreach (var t in rows.Skip(1)) {
|
---|
| 458 | for (int step = 0; step < NUM_STEPS; step++) {
|
---|
| 459 | var deltaValues = new Dictionary<string, Tuple<double, Vector>>();
|
---|
[15970] | 460 | foreach (var tup in trees.Zip(calculatedVariables, Tuple.Create)) {
|
---|
[15964] | 461 | var tree = tup.Item1;
|
---|
| 462 | var targetVarName = tup.Item2;
|
---|
| 463 | // skip programRoot and startSymbol
|
---|
| 464 | var res = InterpretRec(tree.Root.GetSubtree(0).GetSubtree(0), variableValues, nodeIdx, parameterValues);
|
---|
| 465 | deltaValues.Add(targetVarName, res);
|
---|
| 466 | }
|
---|
| 467 |
|
---|
| 468 | // update variableValues for next step
|
---|
| 469 | foreach (var kvp in deltaValues) {
|
---|
| 470 | var oldVal = variableValues[kvp.Key];
|
---|
| 471 | variableValues[kvp.Key] = Tuple.Create(
|
---|
| 472 | oldVal.Item1 + h * kvp.Value.Item1,
|
---|
| 473 | oldVal.Item2 + h * kvp.Value.Item2
|
---|
| 474 | );
|
---|
| 475 | }
|
---|
| 476 | }
|
---|
| 477 |
|
---|
[15970] | 478 | // only return the target variables for calculation of errors
|
---|
[15968] | 479 | yield return targetVariables
|
---|
| 480 | .Select(targetVar => variableValues[targetVar])
|
---|
| 481 | .ToArray();
|
---|
[15964] | 482 |
|
---|
| 483 | // update for next time step
|
---|
| 484 | foreach (var varName in inputVariables) {
|
---|
| 485 | variableValues[varName] = Tuple.Create(dataset.GetDoubleValue(varName, t), Vector.Zero);
|
---|
| 486 | }
|
---|
| 487 | }
|
---|
| 488 | }
|
---|
| 489 |
|
---|
| 490 | private static Tuple<double, Vector> InterpretRec(
|
---|
| 491 | ISymbolicExpressionTreeNode node,
|
---|
| 492 | Dictionary<string, Tuple<double, Vector>> variableValues,
|
---|
| 493 | Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx,
|
---|
| 494 | double[] parameterValues
|
---|
| 495 | ) {
|
---|
| 496 |
|
---|
| 497 | switch (node.Symbol.Name) {
|
---|
| 498 | case "+": {
|
---|
[15970] | 499 | var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues); // TODO capture all parameters into a state type for interpretation
|
---|
[15964] | 500 | var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
|
---|
| 501 |
|
---|
| 502 | return Tuple.Create(l.Item1 + r.Item1, l.Item2 + r.Item2);
|
---|
| 503 | }
|
---|
| 504 | case "*": {
|
---|
[15968] | 505 | var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
|
---|
[15964] | 506 | var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
|
---|
| 507 |
|
---|
| 508 | return Tuple.Create(l.Item1 * r.Item1, l.Item2 * r.Item1 + l.Item1 * r.Item2);
|
---|
| 509 | }
|
---|
| 510 |
|
---|
| 511 | case "-": {
|
---|
[15968] | 512 | var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
|
---|
[15964] | 513 | var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
|
---|
| 514 |
|
---|
| 515 | return Tuple.Create(l.Item1 - r.Item1, l.Item2 - r.Item2);
|
---|
| 516 | }
|
---|
| 517 | case "%": {
|
---|
[15968] | 518 | var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
|
---|
[15964] | 519 | var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
|
---|
| 520 |
|
---|
| 521 | // protected division
|
---|
| 522 | if (r.Item1.IsAlmost(0.0)) {
|
---|
| 523 | return Tuple.Create(0.0, Vector.Zero);
|
---|
| 524 | } else {
|
---|
| 525 | return Tuple.Create(
|
---|
| 526 | l.Item1 / r.Item1,
|
---|
| 527 | l.Item1 * -1.0 / (r.Item1 * r.Item1) * r.Item2 + 1.0 / r.Item1 * l.Item2 // (f/g)' = f * (1/g)' + 1/g * f' = f * -1/g² * g' + 1/g * f'
|
---|
| 528 | );
|
---|
| 529 | }
|
---|
| 530 | }
|
---|
| 531 | default: {
|
---|
| 532 | // distinguish other cases
|
---|
| 533 | if (IsConstantNode(node)) {
|
---|
| 534 | var vArr = new double[parameterValues.Length]; // backing array for vector
|
---|
| 535 | vArr[nodeIdx[node]] = 1.0;
|
---|
| 536 | var g = new Vector(vArr);
|
---|
| 537 | return Tuple.Create(parameterValues[nodeIdx[node]], g);
|
---|
| 538 | } else {
|
---|
| 539 | // assume a variable name
|
---|
| 540 | var varName = node.Symbol.Name;
|
---|
| 541 | return variableValues[varName];
|
---|
| 542 | }
|
---|
| 543 | }
|
---|
| 544 | }
|
---|
| 545 | }
|
---|
[15968] | 546 | #endregion
|
---|
[15964] | 547 |
|
---|
| 548 | #region events
|
---|
[15968] | 549 | /*
|
---|
| 550 | * Dependencies between parameters:
|
---|
| 551 | *
|
---|
| 552 | * ProblemData
|
---|
| 553 | * |
|
---|
| 554 | * V
|
---|
[15970] | 555 | * TargetVariables FunctionSet MaximumLength NumberOfLatentVariables
|
---|
| 556 | * | | | |
|
---|
| 557 | * V V | |
|
---|
| 558 | * Grammar <---------------+-------------------
|
---|
[15968] | 559 | * |
|
---|
| 560 | * V
|
---|
| 561 | * Encoding
|
---|
| 562 | */
|
---|
[15964] | 563 | private void RegisterEventHandlers() {
|
---|
[15968] | 564 | ProblemDataParameter.ValueChanged += ProblemDataParameter_ValueChanged;
|
---|
| 565 | if (ProblemDataParameter.Value != null) ProblemDataParameter.Value.Changed += ProblemData_Changed;
|
---|
| 566 |
|
---|
| 567 | TargetVariablesParameter.ValueChanged += TargetVariablesParameter_ValueChanged;
|
---|
| 568 | if (TargetVariablesParameter.Value != null) TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
|
---|
| 569 |
|
---|
| 570 | FunctionSetParameter.ValueChanged += FunctionSetParameter_ValueChanged;
|
---|
| 571 | if (FunctionSetParameter.Value != null) FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
|
---|
| 572 |
|
---|
| 573 | MaximumLengthParameter.Value.ValueChanged += MaximumLengthChanged;
|
---|
[15970] | 574 |
|
---|
| 575 | NumberOfLatentVariablesParameter.Value.ValueChanged += NumLatentVariablesChanged;
|
---|
[15964] | 576 | }
|
---|
| 577 |
|
---|
[15970] | 578 | private void NumLatentVariablesChanged(object sender, EventArgs e) {
|
---|
| 579 | UpdateGrammarAndEncoding();
|
---|
| 580 | }
|
---|
| 581 |
|
---|
[15968] | 582 | private void MaximumLengthChanged(object sender, EventArgs e) {
|
---|
| 583 | UpdateGrammarAndEncoding();
|
---|
| 584 | }
|
---|
| 585 |
|
---|
| 586 | private void FunctionSetParameter_ValueChanged(object sender, EventArgs e) {
|
---|
| 587 | FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
|
---|
| 588 | }
|
---|
| 589 |
|
---|
| 590 | private void CheckedFunctionsChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
|
---|
| 591 | UpdateGrammarAndEncoding();
|
---|
| 592 | }
|
---|
| 593 |
|
---|
| 594 | private void TargetVariablesParameter_ValueChanged(object sender, EventArgs e) {
|
---|
| 595 | TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
|
---|
| 596 | }
|
---|
| 597 |
|
---|
| 598 | private void CheckedTargetVariablesChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
|
---|
| 599 | UpdateGrammarAndEncoding();
|
---|
| 600 | }
|
---|
| 601 |
|
---|
[15964] | 602 | private void ProblemDataParameter_ValueChanged(object sender, EventArgs e) {
|
---|
[15968] | 603 | ProblemDataParameter.Value.Changed += ProblemData_Changed;
|
---|
[15964] | 604 | OnProblemDataChanged();
|
---|
| 605 | OnReset();
|
---|
| 606 | }
|
---|
| 607 |
|
---|
| 608 | private void ProblemData_Changed(object sender, EventArgs e) {
|
---|
[15968] | 609 | OnProblemDataChanged();
|
---|
[15964] | 610 | OnReset();
|
---|
| 611 | }
|
---|
| 612 |
|
---|
| 613 | private void OnProblemDataChanged() {
|
---|
[15968] | 614 | UpdateTargetVariables(); // implicitly updates other dependent parameters
|
---|
[15964] | 615 |
|
---|
| 616 | var handler = ProblemDataChanged;
|
---|
| 617 | if (handler != null) handler(this, EventArgs.Empty);
|
---|
| 618 | }
|
---|
| 619 |
|
---|
[15968] | 620 | #endregion
|
---|
| 621 |
|
---|
| 622 | #region helper
|
---|
| 623 |
|
---|
| 624 | private void InitAllParameters() {
|
---|
| 625 | UpdateTargetVariables(); // implicitly updates the grammar and the encoding
|
---|
| 626 | }
|
---|
| 627 |
|
---|
| 628 | private ReadOnlyCheckedItemCollection<StringValue> CreateFunctionSet() {
|
---|
| 629 | var l = new CheckedItemCollection<StringValue>();
|
---|
| 630 | l.Add(new StringValue("+").AsReadOnly());
|
---|
| 631 | l.Add(new StringValue("*").AsReadOnly());
|
---|
| 632 | l.Add(new StringValue("%").AsReadOnly());
|
---|
| 633 | l.Add(new StringValue("-").AsReadOnly());
|
---|
| 634 | return l.AsReadOnly();
|
---|
| 635 | }
|
---|
| 636 |
|
---|
| 637 | private static bool IsConstantNode(ISymbolicExpressionTreeNode n) {
|
---|
| 638 | return n.Symbol.Name.StartsWith("θ");
|
---|
| 639 | }
|
---|
[15970] | 640 | private static bool IsLatentVariableNode(ISymbolicExpressionTreeNode n) {
|
---|
| 641 | return n.Symbol.Name.StartsWith("λ");
|
---|
| 642 | }
|
---|
[15968] | 643 |
|
---|
| 644 |
|
---|
| 645 | private void UpdateTargetVariables() {
|
---|
| 646 | var currentlySelectedVariables = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
|
---|
| 647 |
|
---|
| 648 | var newVariablesList = new CheckedItemCollection<StringValue>(ProblemData.Dataset.VariableNames.Select(str => new StringValue(str).AsReadOnly()).ToArray()).AsReadOnly();
|
---|
| 649 | var matchingItems = newVariablesList.Where(item => currentlySelectedVariables.Contains(item.Value)).ToArray();
|
---|
| 650 | foreach (var matchingItem in matchingItems) {
|
---|
| 651 | newVariablesList.SetItemCheckedState(matchingItem, true);
|
---|
| 652 | }
|
---|
| 653 | TargetVariablesParameter.Value = newVariablesList;
|
---|
| 654 | }
|
---|
| 655 |
|
---|
| 656 | private void UpdateGrammarAndEncoding() {
|
---|
| 657 | var encoding = new MultiEncoding();
|
---|
| 658 | var g = CreateGrammar();
|
---|
| 659 | foreach (var targetVar in TargetVariables.CheckedItems) {
|
---|
[15970] | 660 | encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar + "_tree", g, MaximumLength, MaximumLength)); // only limit by length
|
---|
[15968] | 661 | }
|
---|
[15970] | 662 | for (int i = 1; i <= NumberOfLatentVariables; i++) {
|
---|
| 663 | encoding = encoding.Add(new SymbolicExpressionTreeEncoding("λ" + i + "_tree", g, MaximumLength, MaximumLength));
|
---|
| 664 | }
|
---|
[15968] | 665 | Encoding = encoding;
|
---|
| 666 | }
|
---|
| 667 |
|
---|
| 668 | private ISymbolicExpressionGrammar CreateGrammar() {
|
---|
[15964] | 669 | // whenever ProblemData is changed we create a new grammar with the necessary symbols
|
---|
| 670 | var g = new SimpleSymbolicExpressionGrammar();
|
---|
[15968] | 671 | g.AddSymbols(FunctionSet.CheckedItems.Select(i => i.Value).ToArray(), 2, 2);
|
---|
[15964] | 672 |
|
---|
| 673 | // TODO
|
---|
| 674 | //g.AddSymbols(new[] {
|
---|
| 675 | // "exp",
|
---|
| 676 | // "log", // log( <expr> ) // TODO: init a theta to ensure the value is always positive
|
---|
| 677 | // "exp_minus" // exp((-1) * <expr>
|
---|
| 678 | //}, 1, 1);
|
---|
| 679 |
|
---|
[15968] | 680 | foreach (var variableName in ProblemData.AllowedInputVariables.Union(TargetVariables.CheckedItems.Select(i => i.Value)))
|
---|
[15964] | 681 | g.AddTerminalSymbol(variableName);
|
---|
| 682 |
|
---|
| 683 | // generate symbols for numeric parameters for which the value is optimized using AutoDiff
|
---|
| 684 | // we generate multiple symbols to balance the probability for selecting a numeric parameter in the generation of random trees
|
---|
| 685 | var numericConstantsFactor = 2.0;
|
---|
[15968] | 686 | for (int i = 0; i < numericConstantsFactor * (ProblemData.AllowedInputVariables.Count() + TargetVariables.CheckedItems.Count()); i++) {
|
---|
[15964] | 687 | g.AddTerminalSymbol("θ" + i); // numeric parameter for which the value is optimized using AutoDiff
|
---|
| 688 | }
|
---|
[15970] | 689 |
|
---|
| 690 | // generate symbols for latent variables
|
---|
| 691 | for (int i = 1; i <= NumberOfLatentVariables; i++) {
|
---|
| 692 | g.AddTerminalSymbol("λ" + i); // numeric parameter for which the value is optimized using AutoDiff
|
---|
| 693 | }
|
---|
| 694 |
|
---|
[15968] | 695 | return g;
|
---|
[15964] | 696 | }
|
---|
[15968] | 697 |
|
---|
[15964] | 698 | #endregion
|
---|
| 699 |
|
---|
| 700 | #region Import & Export
|
---|
| 701 | public void Load(IRegressionProblemData data) {
|
---|
| 702 | Name = data.Name;
|
---|
| 703 | Description = data.Description;
|
---|
| 704 | ProblemData = data;
|
---|
| 705 | }
|
---|
| 706 |
|
---|
| 707 | public IRegressionProblemData Export() {
|
---|
| 708 | return ProblemData;
|
---|
| 709 | }
|
---|
| 710 | #endregion
|
---|
| 711 |
|
---|
| 712 | }
|
---|
| 713 | }
|
---|