#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Analysis;
using HeuristicLab.Collections;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;
using HeuristicLab.Problems.Instances;
using Variable = HeuristicLab.Problems.DataAnalysis.Symbolic.Variable;

namespace HeuristicLab.Problems.DynamicalSystemsModelling {
  /// <summary>
  /// A lightweight vector of doubles used to propagate gradients (forward-mode
  /// automatic differentiation) through the tree interpreter.
  /// <see cref="Zero"/> is a shared sentinel meaning "no gradient"; all operators
  /// test for it by reference identity and short-circuit accordingly, so it must
  /// never be mutated.
  /// </summary>
  public class Vector {
    public readonly static Vector Zero = new Vector(new double[0]);

    public static Vector operator +(Vector a, Vector b) {
      if (a == Zero) return b;
      if (b == Zero) return a;
      Debug.Assert(a.arr.Length == b.arr.Length);
      var res = new double[a.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = a.arr[i] + b.arr[i];
      return new Vector(res);
    }

    public static Vector operator -(Vector a, Vector b) {
      if (b == Zero) return a;
      if (a == Zero) return -b;
      Debug.Assert(a.arr.Length == b.arr.Length);
      var res = new double[a.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = a.arr[i] - b.arr[i];
      return new Vector(res);
    }

    public static Vector operator -(Vector v) {
      if (v == Zero) return Zero;
      // BUGFIX: negate into a fresh array. The previous implementation negated
      // v.arr in place and returned v itself, silently corrupting the operand
      // (e.g. "Zero - b" permanently negated b). All other operators allocate.
      var res = new double[v.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = -v.arr[i];
      return new Vector(res);
    }

    public static Vector operator *(double s, Vector v) {
      if (v == Zero) return Zero;
      if (s == 0.0) return Zero;
      var res = new double[v.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = s * v.arr[i];
      return new Vector(res);
    }

    public static Vector operator *(Vector v, double s) {
      return s * v;
    }

    public static Vector operator *(Vector u, Vector v) {
      if (v == Zero) return Zero;
      if (u == Zero) return Zero;
      var res = new double[v.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = u.arr[i] * v.arr[i];
      return new Vector(res);
    }

    public static Vector operator /(double s, Vector v) {
      if (s == 0.0) return Zero;
      if (v == Zero) throw new ArgumentException("Division by zero vector");
      var res = new double[v.arr.Length];
      // BUGFIX: was "1.0 / v.arr[i]", which ignored s and returned the
      // element-wise reciprocal instead of s / v.
      for (int i = 0; i < res.Length; i++) res[i] = s / v.arr[i];
      return new Vector(res);
    }

    public static Vector operator /(Vector v, double s) {
      // BUGFIX: the original "return v * 1.0 / s;" parses as "(v * 1.0) / s",
      // which calls this very operator again -> unbounded recursion
      // (StackOverflowException). Parenthesize so the division is scalar.
      return v * (1.0 / s);
    }

    public static Vector Sin(Vector s) {
      var res = new double[s.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = Math.Sin(s.arr[i]);
      return new Vector(res);
    }

    public static Vector Cos(Vector s) {
      var res = new double[s.arr.Length];
      for (int i = 0; i < res.Length; i++) res[i] = Math.Cos(s.arr[i]);
      return new Vector(res);
    }

    private readonly double[] arr; // backing array

    public Vector(double[] v) {
      this.arr = v;
    }

    /// <summary>Copies this vector into the first arr.Length slots of target.</summary>
    public void CopyTo(double[] target) {
      Debug.Assert(arr.Length <= target.Length);
      Array.Copy(arr, target, arr.Length);
    }
  }

  /// <summary>
  /// GP problem for learning systems of ordinary differential equations: each
  /// tree in an individual encodes the time-derivative expression of one target
  /// (or latent) variable. Fitness is the NMSE of the forward-integrated
  /// trajectories against the observed data; numeric parameters (θ symbols) are
  /// optimized per evaluation with L-BFGS using gradients obtained via
  /// forward-mode automatic differentiation (the Vector type above).
  /// NOTE(review): generic type arguments below were reconstructed from the
  /// casts and usages in this file (the original text had them stripped).
  /// </summary>
  [Item("Dynamical Systems Modelling Problem", "TODO")]
  [Creatable(CreatableAttribute.Categories.GeneticProgrammingProblems, Priority = 900)]
  [StorableClass]
  public sealed class Problem : SingleObjectiveBasicProblem<MultiEncoding>, IRegressionProblem, IProblemInstanceConsumer<IRegressionProblemData>, IProblemInstanceExporter<IRegressionProblemData> {
    #region parameter names
    private const string ProblemDataParameterName = "Data";
    private const string TargetVariablesParameterName = "Target variables";
    private const string FunctionSetParameterName = "Function set";
    private const string MaximumLengthParameterName = "Size limit";
    private const string MaximumParameterOptimizationIterationsParameterName = "Max. parameter optimization iterations";
    private const string NumberOfLatentVariablesParameterName = "Number of latent variables";
    private const string NumericIntegrationStepsParameterName = "Steps for numeric integration";
    private const string TrainingEpisodesParameterName = "Training episodes";
    private const string OptimizeParametersForEpisodesParameterName = "Optimize parameters for episodes";
    #endregion

    #region Parameter Properties
    IParameter IDataAnalysisProblem.ProblemDataParameter { get { return ProblemDataParameter; } }

    public IValueParameter<IRegressionProblemData> ProblemDataParameter {
      get { return (IValueParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
    }
    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> TargetVariablesParameter {
      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[TargetVariablesParameterName]; }
    }
    public IValueParameter<ReadOnlyCheckedItemCollection<StringValue>> FunctionSetParameter {
      get { return (IValueParameter<ReadOnlyCheckedItemCollection<StringValue>>)Parameters[FunctionSetParameterName]; }
    }
    public IFixedValueParameter<IntValue> MaximumLengthParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumLengthParameterName]; }
    }
    public IFixedValueParameter<IntValue> MaximumParameterOptimizationIterationsParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumParameterOptimizationIterationsParameterName]; }
    }
    public IFixedValueParameter<IntValue> NumberOfLatentVariablesParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[NumberOfLatentVariablesParameterName]; }
    }
    public IFixedValueParameter<IntValue> NumericIntegrationStepsParameter {
      get { return (IFixedValueParameter<IntValue>)Parameters[NumericIntegrationStepsParameterName]; }
    }
    public IValueParameter<ItemList<IntRange>> TrainingEpisodesParameter {
      get { return (IValueParameter<ItemList<IntRange>>)Parameters[TrainingEpisodesParameterName]; }
    }
    public IFixedValueParameter<BoolValue> OptimizeParametersForEpisodesParameter {
      get { return (IFixedValueParameter<BoolValue>)Parameters[OptimizeParametersForEpisodesParameterName]; }
    }
    #endregion

    #region Properties
    public IRegressionProblemData ProblemData {
      get { return ProblemDataParameter.Value; }
      set { ProblemDataParameter.Value = value; }
    }
    IDataAnalysisProblemData IDataAnalysisProblem.ProblemData { get { return ProblemData; } }
    public ReadOnlyCheckedItemCollection<StringValue> TargetVariables {
      get { return TargetVariablesParameter.Value; }
    }
    public ReadOnlyCheckedItemCollection<StringValue> FunctionSet {
      get { return FunctionSetParameter.Value; }
    }
    public int MaximumLength {
      get { return MaximumLengthParameter.Value.Value; }
    }
    public int MaximumParameterOptimizationIterations {
      get { return MaximumParameterOptimizationIterationsParameter.Value.Value; }
    }
    public int NumberOfLatentVariables {
      get { return NumberOfLatentVariablesParameter.Value.Value; }
    }
    public int NumericIntegrationSteps {
      get { return NumericIntegrationStepsParameter.Value.Value; }
    }
    public IEnumerable<IntRange> TrainingEpisodes {
      get { return TrainingEpisodesParameter.Value; }
    }
    public bool OptimizeParametersForEpisodes {
      get { return OptimizeParametersForEpisodesParameter.Value.Value; }
    }
    #endregion

    public event EventHandler ProblemDataChanged;

    public override bool Maximization {
      get { return false; } // we minimize NMSE
    }

    #region item cloning and persistence
    // persistence
    [StorableConstructor]
    private Problem(bool deserializing) : base(deserializing) { }

    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      // backwards compatibility: older persisted instances lack this parameter
      if (!Parameters.ContainsKey(OptimizeParametersForEpisodesParameterName)) {
        Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select if parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
      }
      RegisterEventHandlers();
    }

    // cloning
    private Problem(Problem original, Cloner cloner) : base(original, cloner) {
      RegisterEventHandlers();
    }
    public override IDeepCloneable Clone(Cloner cloner) { return new Problem(this, cloner); }
    #endregion

    public Problem() : base() {
      var targetVariables = new CheckedItemCollection<StringValue>().AsReadOnly(); // HACK: it would be better to provide a new class derived from IDataAnalysisProblem
      var functions = CreateFunctionSet();
      Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system. Use CSV import functionality to import data.", new RegressionProblemData()));
      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(TargetVariablesParameterName, "Target variables (overrides setting in ProblemData)", targetVariables));
      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(FunctionSetParameterName, "The list of allowed functions", functions));
      Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximally allowed length of each expression. Set to a small value (5 - 25). Default = 10", new IntValue(10)));
      Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for optimization of parameters (using L-BFGS). More iterations makes the algorithm slower, fewer iterations might prevent convergence in the optimization scheme. Default = 100", new IntValue(100)));
      Parameters.Add(new FixedValueParameter<IntValue>(NumberOfLatentVariablesParameterName, "Latent variables (unobserved variables) allow us to produce expressions which are integrated up and can be used in other expressions. They are handled similarly to target variables in forward simulation / integration. The difference to target variables is that there are no data to which the calculated values of latent variables are compared. Set to a small value (0 .. 5) as necessary (default = 0)", new IntValue(0)));
      Parameters.Add(new FixedValueParameter<IntValue>(NumericIntegrationStepsParameterName, "Number of steps in the numeric integration that are taken from one row to the next (set to 1 to 100). More steps makes the algorithm slower, less steps worsens the accuracy of the numeric integration scheme.", new IntValue(10)));
      Parameters.Add(new ValueParameter<ItemList<IntRange>>(TrainingEpisodesParameterName, "A list of ranges that should be used for training, each range represents an independent episode. This overrides the TrainingSet parameter in ProblemData.", new ItemList<IntRange>()));
      Parameters.Add(new FixedValueParameter<BoolValue>(OptimizeParametersForEpisodesParameterName, "Flag to select if parameters should be optimized globally or for each episode individually.", new BoolValue(false)));
      RegisterEventHandlers();
      InitAllParameters();

      // TODO: do not clear selection of target variables when the input variables are changed (keep selected target variables)
      // TODO: UI hangs when selecting / deselecting input variables because the encoding is updated on each item
      // TODO: use training range as default training episode
    }

    /// <summary>
    /// Fitness evaluation: optimizes the numeric parameters of all trees in the
    /// individual (either once globally or once per training episode) and
    /// returns the resulting NMSE (episode-size-weighted average in the
    /// per-episode case). Optimized parameters are written back into the
    /// individual's scope so Analyze can reuse them.
    /// </summary>
    public override double Evaluate(Individual individual, IRandom random) {
      var trees = individual.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
      if (OptimizeParametersForEpisodes) {
        int eIdx = 0;
        double totalNMSE = 0.0;
        int totalSize = 0;
        foreach (var episode in TrainingEpisodes) {
          double[] optTheta;
          double nmse;
          OptimizeForEpisodes(trees, random, new[] { episode }, out optTheta, out nmse);
          individual["OptTheta_" + eIdx] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analysis method
          eIdx++;
          totalNMSE += nmse * episode.Size;
          totalSize += episode.Size;
        }
        return totalNMSE / totalSize;
      } else {
        double[] optTheta;
        double nmse;
        OptimizeForEpisodes(trees, random, TrainingEpisodes, out optTheta, out nmse);
        individual["OptTheta"] = new DoubleArray(optTheta); // write back optimized parameters so that we can use them in the Analysis method
        return nmse;
      }
    }

    /// <summary>
    /// Randomly initializes all θ parameters of the trees, optimizes them with
    /// L-BFGS over the given episodes, and evaluates the NMSE at the optimum.
    /// On optimizer failure or a non-finite NMSE a large penalty (10E6) is
    /// returned instead.
    /// </summary>
    private void OptimizeForEpisodes(ISymbolicExpressionTree[] trees, IRandom random, IEnumerable<IntRange> episodes, out double[] optTheta, out double nmse) {
      var rows = episodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start)).ToArray();
      var problemData = ProblemData;
      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees

      // collect values of all target variables
      var targetValues = new double[rows.Length, targetVars.Length];
      var colIdx = 0;
      foreach (var targetVar in targetVars) {
        int rowIdx = 0;
        foreach (var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
          targetValues[rowIdx, colIdx] = value;
          rowIdx++;
        }
        colIdx++;
      }

      // assign each θ node (over all trees) a stable index into the parameter vector
      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
      foreach (var tree in trees) {
        foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
          nodeIdx.Add(node, nodeIdx.Count);
        }
      }

      var theta = nodeIdx.Select(_ => random.NextDouble() * 2.0 - 1.0).ToArray(); // init params randomly from Unif(-1,1)

      optTheta = new double[0];
      if (theta.Length > 0) {
        alglib.minlbfgsstate state;
        alglib.minlbfgsreport report;
        alglib.minlbfgscreate(Math.Min(theta.Length, 5), theta, out state);
        alglib.minlbfgssetcond(state, 0.0, 0.0, 0.0, MaximumParameterOptimizationIterations);
        alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null,
          new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables }); //TODO: create a type
        alglib.minlbfgsresults(state, out optTheta, out report);

        /* Rep.TerminationType completion codes (see ALGLIB minlbfgs docs):
         *   -7  gradient verification failed (MinLBFGSSetGradientCheck)
         *   -2  rounding errors prevent further improvement; X contains best point found
         *   -1  incorrect parameters were specified
         *    1  relative function improvement is no more than EpsF
         *    2  relative step is no more than EpsX
         *    4  gradient norm is no more than EpsG
         *    5  MaxIts steps were taken
         *    7  stopping conditions are too stringent, further improvement is impossible
         * Rep.IterationsCount contains iteration count, NFEV the number of function evaluations.
         */
        if (report.terminationtype < 0) { nmse = 10E6; return; }
      }

      // perform evaluation for optimal theta to get quality value
      double[] grad = new double[optTheta.Length];
      nmse = double.NaN;
      EvaluateObjectiveAndGradient(optTheta, ref nmse, grad,
        new object[] { trees, targetVars, problemData, nodeIdx, targetValues, episodes.ToArray(), NumericIntegrationSteps, latentVariables });
      if (double.IsNaN(nmse) || double.IsInfinity(nmse)) { nmse = 10E6; return; } // return a large value (TODO: be consistent by using NMSE)
    }

    /// <summary>
    /// Objective for L-BFGS: f is the normalized MSE of the integrated
    /// trajectories over all target variables, grad its gradient w.r.t. the
    /// θ parameters x (accumulated through the Vector type).
    /// </summary>
    private static void EvaluateObjectiveAndGradient(double[] x, ref double f, double[] grad, object obj) {
      var trees = (ISymbolicExpressionTree[])((object[])obj)[0];
      var targetVariables = (string[])((object[])obj)[1];
      var problemData = (IRegressionProblemData)((object[])obj)[2];
      var nodeIdx = (Dictionary<ISymbolicExpressionTreeNode, int>)((object[])obj)[3];
      var targetValues = (double[,])((object[])obj)[4];
      var episodes = (IntRange[])((object[])obj)[5];
      var numericIntegrationSteps = (int)((object[])obj)[6];
      var latentVariables = (string[])((object[])obj)[7];

      var predicted = Integrate(
        trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
        problemData.Dataset,
        problemData.AllowedInputVariables.ToArray(),
        targetVariables,
        latentVariables,
        episodes,
        nodeIdx,
        x,
        numericIntegrationSteps).ToArray();

      // for normalized MSE = 1/variance(t) * MSE(t, pred)
      // TODO: Perf. (by standardization of target variables before evaluation of all trees)
      var invVar = Enumerable.Range(0, targetVariables.Length)
        .Select(c => Enumerable.Range(0, targetValues.GetLength(0)).Select(row => targetValues[row, c])) // column vectors
        .Select(vec => vec.Variance())
        .Select(v => 1.0 / v)
        .ToArray();

      // objective function is NMSE
      f = 0.0;
      int n = predicted.Length;
      double invN = 1.0 / n;
      var g = Vector.Zero;
      int r = 0;
      foreach (var y_pred in predicted) {
        for (int c = 0; c < y_pred.Length; c++) {
          var y_pred_f = y_pred[c].Item1;
          var y = targetValues[r, c];

          var res = (y - y_pred_f);
          var ressq = res * res;
          f += ressq * invN * invVar[c];
          g += -2.0 * res * y_pred[c].Item2 * invN * invVar[c];
        }
        r++;
      }
      g.CopyTo(grad);
    }

    public override void Analyze(Individual[] individuals, double[] qualities, ResultCollection results, IRandom random) {
      base.Analyze(individuals, qualities, results, random);

      if (!results.ContainsKey("Prediction (training)")) {
        results.Add(new Result("Prediction (training)", typeof(ReadOnlyItemList<DataTable>)));
      }
      if (!results.ContainsKey("Prediction (test)")) {
        results.Add(new Result("Prediction (test)", typeof(ReadOnlyItemList<DataTable>)));
      }
      if (!results.ContainsKey("Models")) {
        results.Add(new Result("Models", typeof(VariableCollection)));
      }

      var bestIndividualAndQuality = this.GetBestIndividual(individuals, qualities);
      var trees = bestIndividualAndQuality.Item1.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual

      // TODO extract common functionality from Evaluate and Analyze
      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
      foreach (var tree in trees) {
        foreach (var node in tree.Root.IterateNodesPrefix().Where(n => IsConstantNode(n))) {
          nodeIdx.Add(node, nodeIdx.Count);
        }
      }
      var problemData = ProblemData;
      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees

      var trainingList = new ItemList<DataTable>();

      if (OptimizeParametersForEpisodes) {
        var eIdx = 0;
        var trainingPredictions = new List<Tuple<double, Vector>[][]>();
        foreach (var episode in TrainingEpisodes) {
          var episodes = new[] { episode };
          var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta_" + eIdx]).ToArray(); // see evaluate
          var trainingPrediction = Integrate(
            trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
            problemData.Dataset,
            problemData.AllowedInputVariables.ToArray(),
            targetVars,
            latentVariables,
            episodes,
            nodeIdx,
            optTheta,
            NumericIntegrationSteps).ToArray();
          trainingPredictions.Add(trainingPrediction);
          eIdx++;
        }

        // only for actual target values
        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPredictions.SelectMany(arr => arr.Select(row => row[colIdx].Item1)).ToArray());
          trainingDataTable.Rows.Add(actualValuesRow);
          trainingDataTable.Rows.Add(predictedValuesRow);
          trainingList.Add(trainingDataTable);
        }
        results["Prediction (training)"].Value = trainingList.AsReadOnly();

        var models = new VariableCollection();
        foreach (var tup in targetVars.Zip(trees, Tuple.Create)) {
          var targetVarName = tup.Item1;
          var tree = tup.Item2;

          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
          models.Add(origTreeVar);
        }
        results["Models"].Value = models;
      } else {
        var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see evaluate
        var trainingPrediction = Integrate(
          trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
          problemData.Dataset,
          problemData.AllowedInputVariables.ToArray(),
          targetVars,
          latentVariables,
          TrainingEpisodes,
          nodeIdx,
          optTheta,
          NumericIntegrationSteps).ToArray();

        // only for actual target values
        var trainingRows = TrainingEpisodes.SelectMany(e => Enumerable.Range(e.Start, e.End - e.Start));
        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var trainingDataTable = new DataTable(targetVar + " prediction (training)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPrediction.Select(arr => arr[colIdx].Item1).ToArray());
          trainingDataTable.Rows.Add(actualValuesRow);
          trainingDataTable.Rows.Add(predictedValuesRow);
          trainingList.Add(trainingDataTable);
        }

        // TODO: DRY for training and test
        var testList = new ItemList<DataTable>();
        var testRows = ProblemData.TestIndices.ToArray();
        var testPrediction = Integrate(
          trees, // we assume trees contain expressions for the change of each target variable over time y'(t)
          problemData.Dataset,
          problemData.AllowedInputVariables.ToArray(),
          targetVars,
          latentVariables,
          new IntRange[] { ProblemData.TestPartition },
          nodeIdx,
          optTheta,
          NumericIntegrationSteps).ToArray();

        for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
          var targetVar = targetVars[colIdx];
          var testDataTable = new DataTable(targetVar + " prediction (test)");
          var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, testRows));
          var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, testPrediction.Select(arr => arr[colIdx].Item1).ToArray());
          testDataTable.Rows.Add(actualValuesRow);
          testDataTable.Rows.Add(predictedValuesRow);
          testList.Add(testDataTable);
        }

        results["Prediction (training)"].Value = trainingList.AsReadOnly();
        results["Prediction (test)"].Value = testList.AsReadOnly();

        #region simplification of models
        // TODO the dependency of HeuristicLab.Problems.DataAnalysis.Symbolic is not ideal
        var models = new VariableCollection(); // to store target var names and original version of tree
        foreach (var tup in targetVars.Zip(trees, Tuple.Create)) {
          var targetVarName = tup.Item1;
          var tree = tup.Item2;

          // when we reference HeuristicLab.Problems.DataAnalysis.Symbolic we can translate symbols
          int nextParIdx = 0;
          var shownTree = new SymbolicExpressionTree(TranslateTreeNode(tree.Root, optTheta, ref nextParIdx));

          var origTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(original)");
          origTreeVar.Value = (ISymbolicExpressionTree)tree.Clone();
          models.Add(origTreeVar);
          var simplifiedTreeVar = new HeuristicLab.Core.Variable(targetVarName + "(simplified)");
          simplifiedTreeVar.Value = TreeSimplifier.Simplify(shownTree);
          models.Add(simplifiedTreeVar);
        }
        results["Models"].Value = models;
        #endregion
      }
    }

    /// <summary>
    /// Translates a tree using this problem's ad-hoc grammar (symbols named by
    /// their string representation) into the standard symbolic-regression
    /// symbol set, substituting the optimized value for each θ parameter node.
    /// Unrecognized symbols are assumed to be variable names.
    /// </summary>
    private ISymbolicExpressionTreeNode TranslateTreeNode(ISymbolicExpressionTreeNode n, double[] parameterValues, ref int nextParIdx) {
      ISymbolicExpressionTreeNode translatedNode = null;
      if (n.Symbol is StartSymbol) {
        translatedNode = new StartSymbol().CreateTreeNode();
      } else if (n.Symbol is ProgramRootSymbol) {
        translatedNode = new ProgramRootSymbol().CreateTreeNode();
      } else if (n.Symbol.Name == "+") {
        translatedNode = new Addition().CreateTreeNode();
      } else if (n.Symbol.Name == "-") {
        translatedNode = new Subtraction().CreateTreeNode();
      } else if (n.Symbol.Name == "*") {
        translatedNode = new Multiplication().CreateTreeNode();
      } else if (n.Symbol.Name == "%") {
        translatedNode = new Division().CreateTreeNode();
      } else if (n.Symbol.Name == "sin") {
        translatedNode = new Sine().CreateTreeNode();
      } else if (n.Symbol.Name == "cos") {
        translatedNode = new Cosine().CreateTreeNode();
      } else if (IsConstantNode(n)) {
        var constNode = (ConstantTreeNode)new Constant().CreateTreeNode();
        constNode.Value = parameterValues[nextParIdx];
        nextParIdx++;
        translatedNode = constNode;
      } else {
        // assume a variable name
        var varName = n.Symbol.Name;
        var varNode = (VariableTreeNode)new Variable().CreateTreeNode();
        varNode.Weight = 1.0;
        varNode.VariableName = varName;
        translatedNode = varNode;
      }
      foreach (var child in n.Subtrees) {
        translatedNode.AddSubtree(TranslateTreeNode(child, parameterValues, ref nextParIdx));
      }
      return translatedNode;
    }

    #region interpretation
    /// <summary>
    /// Forward-integrates the system of ODEs encoded by the trees over each
    /// episode with explicit Euler steps (numericIntegrationSteps sub-steps per
    /// data row), yielding per row an array of (value, gradient-w.r.t.-θ)
    /// tuples for the target variables.
    /// </summary>
    private static IEnumerable<Tuple<double, Vector>[]> Integrate(
      ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, string[] latentVariables, IEnumerable<IntRange> episodes,
      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues, int numericIntegrationSteps = 100) {

      int NUM_STEPS = numericIntegrationSteps;
      double h = 1.0 / NUM_STEPS;

      foreach (var episode in episodes) {
        var rows = Enumerable.Range(episode.Start, episode.End - episode.Start);

        // return first value as stored in the dataset
        yield return targetVariables
          .Select(targetVar => Tuple.Create(dataset.GetDoubleValue(targetVar, rows.First()), Vector.Zero))
          .ToArray();

        // integrate forward starting with known values for the target in t0
        var variableValues = new Dictionary<string, Tuple<double, Vector>>();
        var t0 = rows.First();
        foreach (var varName in inputVariables) {
          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
        }
        foreach (var varName in targetVariables) {
          variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
        }
        // add value entries for latent variables which are also integrated
        foreach (var latentVar in latentVariables) {
          variableValues.Add(latentVar, Tuple.Create(0.0, Vector.Zero)); // we don't have observations for latent variables -> assume zero as starting value
        }
        var calculatedVariables = targetVariables.Concat(latentVariables); // TODO: must coincide with the order of trees in the encoding

        foreach (var t in rows.Skip(1)) {
          for (int step = 0; step < NUM_STEPS; step++) {
            var deltaValues = new Dictionary<string, Tuple<double, Vector>>();
            foreach (var tup in trees.Zip(calculatedVariables, Tuple.Create)) {
              var tree = tup.Item1;
              var targetVarName = tup.Item2;
              // skip programRoot and startSymbol
              var res = InterpretRec(tree.Root.GetSubtree(0).GetSubtree(0), variableValues, nodeIdx, parameterValues);
              deltaValues.Add(targetVarName, res);
            }

            // update variableValues for next step
            foreach (var kvp in deltaValues) {
              var oldVal = variableValues[kvp.Key];
              variableValues[kvp.Key] = Tuple.Create(
                oldVal.Item1 + h * kvp.Value.Item1,
                oldVal.Item2 + h * kvp.Value.Item2
              );
            }
          }

          // only return the target variables for calculation of errors
          yield return targetVariables
            .Select(targetVar => variableValues[targetVar])
            .ToArray();

          // update for next time step
          foreach (var varName in inputVariables) {
            variableValues[varName] = Tuple.Create(dataset.GetDoubleValue(varName, t), Vector.Zero);
          }
        }
      }
    }

    /// <summary>
    /// Recursive interpreter returning a dual number: Item1 is the value,
    /// Item2 the gradient w.r.t. the θ parameter vector.
    /// </summary>
    private static Tuple<double, Vector> InterpretRec(
      ISymbolicExpressionTreeNode node,
      Dictionary<string, Tuple<double, Vector>> variableValues,
      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx,
      double[] parameterValues
    ) {
      switch (node.Symbol.Name) {
        case "+": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues); // TODO capture all parameters into a state type for interpretation
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
            return Tuple.Create(l.Item1 + r.Item1, l.Item2 + r.Item2);
          }
        case "*": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
            // product rule: (f*g)' = f'*g + f*g'
            return Tuple.Create(l.Item1 * r.Item1, l.Item2 * r.Item1 + l.Item1 * r.Item2);
          }
        case "-": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
            return Tuple.Create(l.Item1 - r.Item1, l.Item2 - r.Item2);
          }
        case "%": {
            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);

            // protected division
            if (r.Item1.IsAlmost(0.0)) {
              return Tuple.Create(0.0, Vector.Zero);
            } else {
              return Tuple.Create(
                l.Item1 / r.Item1,
                l.Item1 * -1.0 / (r.Item1 * r.Item1) * r.Item2 + 1.0 / r.Item1 * l.Item2 // (f/g)' = f * (1/g)' + 1/g * f' = f * -1/g² * g' + 1/g * f'
              );
            }
          }
        case "sin": {
            var x = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            // BUGFIX: chain rule is sin(f)' = cos(f) * f'. The previous code
            // applied cos element-wise to the gradient vector itself
            // (Vector.Cos(x.Item2)) instead of scaling it by cos of the value.
            return Tuple.Create(
              Math.Sin(x.Item1),
              Math.Cos(x.Item1) * x.Item2
            );
          }
        case "cos": {
            var x = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
            // BUGFIX: cos(f)' = -sin(f) * f' (same defect as the sin case).
            return Tuple.Create(
              Math.Cos(x.Item1),
              -Math.Sin(x.Item1) * x.Item2
            );
          }
        default: {
            // distinguish other cases
            if (IsConstantNode(node)) {
              // seed gradient: ∂θ_i/∂θ_j = δ_ij
              var vArr = new double[parameterValues.Length]; // backing array for vector
              vArr[nodeIdx[node]] = 1.0;
              var g = new Vector(vArr);
              return Tuple.Create(parameterValues[nodeIdx[node]], g);
            } else {
              // assume a variable name
              var varName = node.Symbol.Name;
              return variableValues[varName];
            }
          }
      }
    }
    #endregion

    #region events
    /*
     * Dependencies between parameters:
     *
     * ProblemData
     *    |
     *    V
     * TargetVariables   FunctionSet   MaximumLength   NumberOfLatentVariables
     *               |   |                 |       |
     *               V   V                 |       |
     *             Grammar <---------------+-------+
     *                |
     *                V
     *            Encoding
     */
    private void RegisterEventHandlers() {
      ProblemDataParameter.ValueChanged += ProblemDataParameter_ValueChanged;
      if (ProblemDataParameter.Value != null) ProblemDataParameter.Value.Changed += ProblemData_Changed;

      TargetVariablesParameter.ValueChanged += TargetVariablesParameter_ValueChanged;
      if (TargetVariablesParameter.Value != null) TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;

      FunctionSetParameter.ValueChanged += FunctionSetParameter_ValueChanged;
      if (FunctionSetParameter.Value != null) FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;

      MaximumLengthParameter.Value.ValueChanged += MaximumLengthChanged;

      NumberOfLatentVariablesParameter.Value.ValueChanged += NumLatentVariablesChanged;
    }

    private void NumLatentVariablesChanged(object sender, EventArgs e) {
      UpdateGrammarAndEncoding();
    }

    private void MaximumLengthChanged(object sender, EventArgs e) {
      UpdateGrammarAndEncoding();
    }

    private void FunctionSetParameter_ValueChanged(object sender, EventArgs e) {
      FunctionSetParameter.Value.CheckedItemsChanged += CheckedFunctionsChanged;
    }

    private void CheckedFunctionsChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
      UpdateGrammarAndEncoding();
    }

    private void TargetVariablesParameter_ValueChanged(object sender, EventArgs e) {
      TargetVariablesParameter.Value.CheckedItemsChanged += CheckedTargetVariablesChanged;
    }

    private void CheckedTargetVariablesChanged(object sender, CollectionItemsChangedEventArgs<StringValue> e) {
      UpdateGrammarAndEncoding();
    }

    private void ProblemDataParameter_ValueChanged(object sender, EventArgs e) {
      ProblemDataParameter.Value.Changed += ProblemData_Changed;
      OnProblemDataChanged();
      OnReset();
    }

    private void ProblemData_Changed(object sender, EventArgs e) {
      OnProblemDataChanged();
      OnReset();
    }

    private void OnProblemDataChanged() {
      UpdateTargetVariables(); // implicitly updates other dependent parameters
      var handler = ProblemDataChanged;
      if (handler != null) handler(this, EventArgs.Empty);
    }
    #endregion

    #region helper
    private void InitAllParameters() {
      UpdateTargetVariables(); // implicitly updates the grammar and the encoding
    }

    private ReadOnlyCheckedItemCollection<StringValue> CreateFunctionSet() {
      var l = new CheckedItemCollection<StringValue>();
      l.Add(new StringValue("+").AsReadOnly());
      l.Add(new StringValue("*").AsReadOnly());
      l.Add(new StringValue("%").AsReadOnly());
      l.Add(new StringValue("-").AsReadOnly());
      l.Add(new StringValue("sin").AsReadOnly());
      l.Add(new StringValue("cos").AsReadOnly());
      return l.AsReadOnly();
    }

    // θ-prefixed symbols are numeric parameters optimized via AutoDiff + L-BFGS
    private static bool IsConstantNode(ISymbolicExpressionTreeNode n) {
      return n.Symbol.Name.StartsWith("θ");
    }
    // λ-prefixed symbols are latent (unobserved, integrated) variables
    private static bool IsLatentVariableNode(ISymbolicExpressionTreeNode n) {
      return n.Symbol.Name.StartsWith("λ");
    }

    private void UpdateTargetVariables() {
      var currentlySelectedVariables = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();

      var newVariablesList = new CheckedItemCollection<StringValue>(ProblemData.Dataset.VariableNames.Select(str => new StringValue(str).AsReadOnly()).ToArray()).AsReadOnly();
      // keep the previous selection where the variable still exists
      var matchingItems = newVariablesList.Where(item => currentlySelectedVariables.Contains(item.Value)).ToArray();
      foreach (var matchingItem in matchingItems) {
        newVariablesList.SetItemCheckedState(matchingItem, true);
      }
      TargetVariablesParameter.Value = newVariablesList;
    }

    private void UpdateGrammarAndEncoding() {
      var encoding = new MultiEncoding();
      var g = CreateGrammar();
      foreach (var targetVar in TargetVariables.CheckedItems) {
        encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar + "_tree", g, MaximumLength, MaximumLength)); // only limit by length
      }
      for (int i = 1; i <= NumberOfLatentVariables; i++) {
        encoding = encoding.Add(new SymbolicExpressionTreeEncoding("λ" + i + "_tree", g, MaximumLength, MaximumLength));
      }
      Encoding = encoding;
    }

    private ISymbolicExpressionGrammar CreateGrammar() {
      // whenever ProblemData is changed we create a new grammar with the necessary symbols
      var g = new SimpleSymbolicExpressionGrammar();
      g.AddSymbols(FunctionSet.CheckedItems.Select(i => i.Value).ToArray(), 2, 2);

      // TODO
      //g.AddSymbols(new[] {
      //  "exp",
      //  "log", // log( ) // TODO: init a theta to ensure the value is always positive
      //  "exp_minus" // exp((-1) *
      //}, 1, 1);

      foreach (var variableName in ProblemData.AllowedInputVariables.Union(TargetVariables.CheckedItems.Select(i => i.Value)))
        g.AddTerminalSymbol(variableName);

      // generate symbols for numeric parameters for which the value is optimized using AutoDiff
      // we generate multiple symbols to balance the probability for selecting a numeric parameter in the generation of random trees
      var numericConstantsFactor = 2.0;
      for (int i = 0; i < numericConstantsFactor * (ProblemData.AllowedInputVariables.Count() + TargetVariables.CheckedItems.Count()); i++) {
        g.AddTerminalSymbol("θ" + i); // numeric parameter for which the value is optimized using AutoDiff
      }

      // generate symbols for latent variables
      for (int i = 1; i <= NumberOfLatentVariables; i++) {
        g.AddTerminalSymbol("λ" + i); // numeric parameter for which the value is optimized using AutoDiff
      }
      return g;
    }
    #endregion

    #region Import & Export
    public void Load(IRegressionProblemData data) {
      Name = data.Name;
      Description = data.Description;
      ProblemData = data;
    }

    public IRegressionProblemData Export() {
      return ProblemData;
    }
    #endregion
  }
}