Changeset 15970


Ignore:
Timestamp:
06/22/18 09:47:35 (3 years ago)
Author:
gkronber
Message:

#2925: added expressions for latent variables and allowed parameterization of the number of integration steps

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/2925_AutoDiffForDynamicalModels/HeuristicLab.Problems.DynamicalSystemsModelling/3.3/Problem.cs

    r15968 r15970  
    112112    private const string MaximumLengthParameterName = "Size limit";
    113113    private const string MaximumParameterOptimizationIterationsParameterName = "Max. parameter optimization iterations";
     114    private const string NumberOfLatentVariablesParameterName = "Number of latent variables";
     115    private const string NumericIntegrationStepsParameterName = "Steps for numeric integration";
    114116    #endregion
    115117
     
    131133    public IFixedValueParameter<IntValue> MaximumParameterOptimizationIterationsParameter {
    132134      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumParameterOptimizationIterationsParameterName]; }
     135    }
     136    public IFixedValueParameter<IntValue> NumberOfLatentVariablesParameter {
     137      get { return (IFixedValueParameter<IntValue>)Parameters[NumberOfLatentVariablesParameterName]; }
     138    }
     139    public IFixedValueParameter<IntValue> NumericIntegrationStepsParameter {
     140      get { return (IFixedValueParameter<IntValue>)Parameters[NumericIntegrationStepsParameterName]; }
    133141    }
    134142    #endregion
     
    155163      get { return MaximumParameterOptimizationIterationsParameter.Value.Value; }
    156164    }
     165    public int NumberOfLatentVariables {
     166      get { return NumberOfLatentVariablesParameter.Value.Value; }
     167    }
     168    public int NumericIntegrationSteps {
     169      get { return NumericIntegrationStepsParameter.Value.Value; }
     170    }
     171
    157172    #endregion                                                                                     
    158173
     
    184199      var targetVariables = new CheckedItemCollection<StringValue>().AsReadOnly(); // HACK: it would be better to provide a new class derived from IDataAnalysisProblem
    185200      var functions = CreateFunctionSet();
    186       Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system", new RegressionProblemData()));
     201      Parameters.Add(new ValueParameter<IRegressionProblemData>(ProblemDataParameterName, "The data captured from the dynamical system. Use CSV import functionality to import data.", new RegressionProblemData()));
    187202      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(TargetVariablesParameterName, "Target variables (overrides setting in ProblemData)", targetVariables));
    188203      Parameters.Add(new ValueParameter<ReadOnlyCheckedItemCollection<StringValue>>(FunctionSetParameterName, "The list of allowed functions", functions));
    189       Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximally allowed length of each expression", new IntValue(20)));
    190       Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for optimization of parameters (using L-BFGS)", new IntValue(100)));
     204      Parameters.Add(new FixedValueParameter<IntValue>(MaximumLengthParameterName, "The maximally allowed length of each expression. Set to a small value (5 - 25). Default = 10", new IntValue(10)));
     205      Parameters.Add(new FixedValueParameter<IntValue>(MaximumParameterOptimizationIterationsParameterName, "The maximum number of iterations for optimization of parameters (using L-BFGS). More iterations makes the algorithm slower, fewer iterations might prevent convergence in the optimization scheme. Default = 100", new IntValue(100)));
     206      Parameters.Add(new FixedValueParameter<IntValue>(NumberOfLatentVariablesParameterName, "Latent variables (unobserved variables) allow us to produce expressions which are integrated up and can be used in other expressions. They are handled similarly to target variables in forward simulation / integration. The difference to target variables is that there are no data to which the calculated values of latent variables are compared. Set to a small value (0 .. 5) as necessary (default = 0)", new IntValue(0)));
     207      Parameters.Add(new FixedValueParameter<IntValue>(NumericIntegrationStepsParameterName, "Number of steps in the numeric integration that are taken from one row to the next (set to 1 to 100). More steps makes the algorithm slower, less steps worsens the accuracy of the numeric integration scheme.", new IntValue(10)));
    191208
    192209      RegisterEventHandlers();
     
    201218      var rows = ProblemData.TrainingIndices.ToArray();
    202219      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
    203       var targetValues = new double[rows.Length,targetVars.Length];
    204      
     220      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
     221      var targetValues = new double[rows.Length, targetVars.Length];
     222
    205223      // collect values of all target variables
    206224      var colIdx = 0;
    207       foreach(var targetVar in targetVars) {
     225      foreach (var targetVar in targetVars) {
    208226        int rowIdx = 0;
    209         foreach(var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
     227        foreach (var value in problemData.Dataset.GetDoubleValues(targetVar, rows)) {
    210228          targetValues[rowIdx, colIdx] = value;
    211229          rowIdx++;
     
    230248        alglib.minlbfgscreate(Math.Min(theta.Length, 5), theta, out state);
    231249        alglib.minlbfgssetcond(state, 0.0, 0.0, 0.0, MaximumParameterOptimizationIterations);
    232         alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null, new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows }); //TODO: create a type
     250        alglib.minlbfgsoptimize(state, EvaluateObjectiveAndGradient, null,
     251          new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows, NumericIntegrationSteps, latentVariables }); //TODO: create a type
    233252        alglib.minlbfgsresults(state, out optTheta, out report);
    234253
     
    265284      double[] grad = new double[optTheta.Length];
    266285      double optQuality = double.NaN;
    267       EvaluateObjectiveAndGradient(optTheta, ref optQuality, grad, new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows });
     286      EvaluateObjectiveAndGradient(optTheta, ref optQuality, grad,
     287        new object[] { trees, targetVars, problemData, nodeIdx, targetValues, rows, NumericIntegrationSteps, latentVariables });
    268288      if (double.IsNaN(optQuality) || double.IsInfinity(optQuality)) return 10E6; // return a large value (TODO: be consistent by using NMSE)
    269289
     
    279299      var targetValues = (double[,])((object[])obj)[4];
    280300      var rows = (int[])((object[])obj)[5];
     301      var numericIntegrationSteps = (int)((object[])obj)[6];
     302      var latentVariables = (string[])((object[])obj)[7];
    281303
    282304      var predicted = Integrate(
     
    285307        problemData.AllowedInputVariables.ToArray(),
    286308        targetVariables,
     309        latentVariables,
    287310        rows,
    288311        nodeIdx,                // TODO: is it Ok to use rows here ?
    289         x).ToArray();
     312        x, numericIntegrationSteps).ToArray();
    290313
    291314
     
    305328      foreach (var y_pred in predicted) {
    306329        // TODO NMSE to put the same weight on each target regardless of the value range;
    307         for(int c = 0;c<y_pred.Length;c++) {
    308          
     330        for (int c = 0; c < y_pred.Length; c++) {
     331
    309332          var y_pred_f = y_pred[c].Item1;
    310           var y = targetValues[r,c];
     333          var y = targetValues[r, c];
    311334
    312335          var res = (y - y_pred_f);
     
    336359      // TODO extract common functionality from Evaluate and Analyze
    337360      var bestIndividualAndQuality = this.GetBestIndividual(individuals, qualities);
    338       var optTheta = ((DoubleArray) bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see evaluate
     361      var optTheta = ((DoubleArray)bestIndividualAndQuality.Item1["OptTheta"]).ToArray(); // see evaluate
    339362      var trees = bestIndividualAndQuality.Item1.Values.Select(v => v.Value).OfType<ISymbolicExpressionTree>().ToArray(); // extract all trees from individual
    340363      var nodeIdx = new Dictionary<ISymbolicExpressionTreeNode, int>();
     
    348371      var problemData = ProblemData;
    349372      var targetVars = TargetVariables.CheckedItems.Select(i => i.Value).ToArray();
     373      var latentVariables = Enumerable.Range(1, NumberOfLatentVariables).Select(i => "λ" + i).ToArray(); // TODO: must coincide with the variables which are actually defined in the grammar and also for which we actually have trees
    350374
    351375      var trainingList = new ItemList<DataTable>();
     
    356380       problemData.AllowedInputVariables.ToArray(),
    357381       targetVars,
     382       latentVariables,
    358383       trainingRows,
    359384       nodeIdx,
    360        optTheta).ToArray();
     385       optTheta,
     386       NumericIntegrationSteps).ToArray();
    361387
    362388      for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
    363389        var targetVar = targetVars[colIdx];
    364         var trainingDataTable = new DataTable(targetVar+ " prediction (training)");
     390        var trainingDataTable = new DataTable(targetVar + " prediction (training)");
    365391        var actualValuesRow = new DataRow(targetVar, "The values of " + targetVar, problemData.Dataset.GetDoubleValues(targetVar, trainingRows));
    366392        var predictedValuesRow = new DataRow(targetVar + " pred.", "Predicted values for " + targetVar, trainingPrediction.Select(arr => arr[colIdx].Item1).ToArray());
     
    378404       problemData.AllowedInputVariables.ToArray(),
    379405       targetVars,
     406       latentVariables,
    380407       testRows,
    381408       nodeIdx,
    382        optTheta).ToArray();
     409       optTheta,
     410       NumericIntegrationSteps).ToArray();
    383411
    384412      for (int colIdx = 0; colIdx < targetVars.Length; colIdx++) {
     
    400428    #region interpretation
    401429    private static IEnumerable<Tuple<double, Vector>[]> Integrate(
    402       ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, IEnumerable<int> rows,
    403       Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues) {
    404 
    405       int NUM_STEPS = 1;
     430      ISymbolicExpressionTree[] trees, IDataset dataset, string[] inputVariables, string[] targetVariables, string[] latentVariables, IEnumerable<int> rows,
     431      Dictionary<ISymbolicExpressionTreeNode, int> nodeIdx, double[] parameterValues, int numericIntegrationSteps = 100) {
     432
     433      int NUM_STEPS = numericIntegrationSteps ;
    406434      double h = 1.0 / NUM_STEPS;
    407435
    408436      // return first value as stored in the dataset
    409 
    410437      yield return targetVariables
    411438        .Select(targetVar => Tuple.Create(dataset.GetDoubleValue(targetVar, rows.First()), Vector.Zero))
     
    422449        variableValues.Add(varName, Tuple.Create(dataset.GetDoubleValue(varName, t0), Vector.Zero));
    423450      }
     451      // add value entries for latent variables which are also integrated
     452      foreach(var latentVar in latentVariables) {
     453        variableValues.Add(latentVar, Tuple.Create(0.0, Vector.Zero)); // we don't have observations for latent variables -> assume zero as starting value
     454      }
     455      var calculatedVariables = targetVariables.Concat(latentVariables); // TODO: must coincide with the order of trees in the encoding
    424456
    425457      foreach (var t in rows.Skip(1)) {
    426458        for (int step = 0; step < NUM_STEPS; step++) {
    427459          var deltaValues = new Dictionary<string, Tuple<double, Vector>>();
    428           foreach (var tup in trees.Zip(targetVariables, Tuple.Create)) {
     460          foreach (var tup in trees.Zip(calculatedVariables, Tuple.Create)) {
    429461            var tree = tup.Item1;
    430462            var targetVarName = tup.Item2;
     
    444476        }
    445477
     478        // only return the target variables for calculation of errors
    446479        yield return targetVariables
    447480          .Select(targetVar => variableValues[targetVar])
     
    464497      switch (node.Symbol.Name) {
    465498        case "+": {
    466             var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues);
     499            var l = InterpretRec(node.GetSubtree(0), variableValues, nodeIdx, parameterValues); // TODO capture all parameters into a state type for interpretation
    467500            var r = InterpretRec(node.GetSubtree(1), variableValues, nodeIdx, parameterValues);
    468501
     
    520553     *    |
    521554     *    V
    522      * TargetVariables   FunctionSet    MaximumLength
    523      *               |   |                 |
    524      *               V   V                 |
    525      *             Grammar <---------------+ 
     555     * TargetVariables   FunctionSet    MaximumLength    NumberOfLatentVariables
     556     *               |   |                 |                   |
     557     *               V   V                 |                   |
     558     *             Grammar <---------------+-------------------+
    526559     *                |
    527560     *                V
     
    539572
    540573      MaximumLengthParameter.Value.ValueChanged += MaximumLengthChanged;
     574
     575      NumberOfLatentVariablesParameter.Value.ValueChanged += NumLatentVariablesChanged;
     576    }
     577
     578    private void NumLatentVariablesChanged(object sender, EventArgs e) {
     579      UpdateGrammarAndEncoding();
    541580    }
    542581
     
    599638      return n.Symbol.Name.StartsWith("θ");
    600639    }
     640    private static bool IsLatentVariableNode(ISymbolicExpressionTreeNode n) {
     641      return n.Symbol.Name.StartsWith("λ");
     642    }
    601643
    602644
     
    616658      var g = CreateGrammar();
    617659      foreach (var targetVar in TargetVariables.CheckedItems) {
    618         encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar+"_tree",g, MaximumLength, MaximumLength)); // only limit by length
     660        encoding = encoding.Add(new SymbolicExpressionTreeEncoding(targetVar + "_tree", g, MaximumLength, MaximumLength)); // only limit by length
     661      }
     662      for (int i = 1; i <= NumberOfLatentVariables; i++) {
     663        encoding = encoding.Add(new SymbolicExpressionTreeEncoding("λ" + i + "_tree", g, MaximumLength, MaximumLength));
    619664      }
    620665      Encoding = encoding;
     
    642687        g.AddTerminalSymbol("θ" + i); // numeric parameter for which the value is optimized using AutoDiff
    643688      }
     689
     690      // generate symbols for latent variables
     691      for (int i = 1; i <= NumberOfLatentVariables; i++) {
     692        g.AddTerminalSymbol("λ" + i); // latent (unobserved) variable; it is integrated like a target variable but has no data to compare against
     693      }
     694
    644695      return g;
    645696    }
Note: See TracChangeset for help on using the changeset viewer.