Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2847_M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/m5.patch @ 16848

Last change on this file since 16848 was 15830, checked in by bwerth, 6 years ago

#2847 adapted project to new rep structure; major changes to interfaces; restructures splitting and pruning

File size: 108.4 KB
  • Interfaces/IM5Model.cs

     
    2424using HeuristicLab.Problems.DataAnalysis;
    2525
    2626namespace HeuristicLab.Algorithms.DataAnalysis {
    27   internal interface IM5Model : IRegressionModel {
     27  public interface IM5Model : IRegressionModel {
    2828    void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken);
    2929    void Update(IReadOnlyList<int> rows, M5Parameters m5Parameters, CancellationToken cancellationToken);
    3030  }
  • Interfaces/IPruning.cs

     
    1919 */
    2020#endregion
    2121using System.Collections.Generic;
     22using System.Threading;
    2223using HeuristicLab.Core;
    2324using HeuristicLab.Problems.DataAnalysis;
    2425
     
    2526namespace HeuristicLab.Algorithms.DataAnalysis {
    /// <summary>
    /// A parameterized named item that reports a minimum leaf size and prunes an <see cref="M5TreeModel"/>.
    /// </summary>
    public interface IPruning : IParameterizedNamedItem {
      /// <summary>
      /// Returns the minimum leaf size for the given problem data and leaf model.
      /// </summary>
      /// <param name="pd">The regression problem data.</param>
      /// <param name="leafModel">The leaf model type in use.</param>
      int MinLeafSize(IRegressionProblemData pd, ILeafModel leafModel);

      /// <summary>
      /// Prunes the given tree model.
      /// </summary>
      /// <param name="treeModel">The tree to prune.</param>
      /// <param name="trainingRows">Row indices of the training partition.</param>
      /// <param name="pruningRows">Row indices of the pruning partition.</param>
      /// <param name="m5Params">Parameter object of the M5 algorithm (contents not visible here).</param>
      /// <param name="cancellationToken">Token handed to the implementation; presumably used to abort pruning.</param>
      void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken);
    }
    2932}
     33 No newline at end of file
  • Interfaces/ISplitter.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21using HeuristicLab.Core;
     22using HeuristicLab.Problems.DataAnalysis;
     23
     24namespace HeuristicLab.Algorithms.DataAnalysis {
     25  public interface ISplitter : IParameterizedNamedItem {
     26    /// <summary>
     27    /// decides wether a node sould be split
     28    /// and if so at which attribute and which value
     29    /// </summary>
     30    /// <param name="splitData"></param>
     31    /// <param name="minLeafSize"></param>
     32    /// <param name="splitAttr"></param>
     33    /// <param name="splitValue"></param>
     34    /// <returns></returns>
     35    bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue);
     36  }
     37}
     38 No newline at end of file
  • LeafModels/ComponentReducedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System.Collections.Generic;
     23using System.Linq;
     24using HeuristicLab.Common;
     25using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     26using HeuristicLab.Problems.DataAnalysis;
     27
     28namespace HeuristicLab.Algorithms.DataAnalysis {
     29  [StorableClass]
     30  public class ComponentReducedLinearModel : RegressionModel, IConfidenceRegressionModel {
     31    [Storable]
     32    private IConfidenceRegressionModel Model;
     33    [Storable]
     34    private PrincipleComponentTransformation Pca;
     35
     36    [StorableConstructor]
     37    private ComponentReducedLinearModel(bool deserializing) : base(deserializing) { }
     38    private ComponentReducedLinearModel(ComponentReducedLinearModel original, Cloner cloner) : base(original, cloner) {
     39      Model = cloner.Clone(original.Model);
     40      Pca = cloner.Clone(original.Pca);
     41    }
     42    public ComponentReducedLinearModel(string targetVariable, IConfidenceRegressionModel model, PrincipleComponentTransformation pca) : base(targetVariable) {
     43      Model = model;
     44      Pca = pca;
     45    }
     46    public override IDeepCloneable Clone(Cloner cloner) {
     47      return new ComponentReducedLinearModel(this, cloner);
     48    }
     49
     50    public override IEnumerable<string> VariablesUsedForPrediction {
     51      get { return Model.VariablesUsedForPrediction; }
     52    }
     53    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     54      var data = ReduceDataset(dataset, rows.ToArray());
     55      return Model.GetEstimatedValues(Pca.TransformDataset(data), Enumerable.Range(0, data.Rows));
     56    }
     57    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     58      return new ConfidenceRegressionSolution(this, problemData);
     59    }
     60    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     61      var data = ReduceDataset(dataset, rows.ToArray());
     62      return Model.GetEstimatedVariances(Pca.TransformDataset(data), Enumerable.Range(0, data.Rows));
     63    }
     64
     65    private IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows) {
     66      return new Dataset(data.DoubleVariables, data.DoubleVariables.Select(v => data.GetDoubleValues(v, rows).ToList()));
     67    }
     68  }
     69}
     70 No newline at end of file
  • LeafModels/DampenedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     27using HeuristicLab.Problems.DataAnalysis;
     28
     29namespace HeuristicLab.Algorithms.DataAnalysis {
     30  //mulitdimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
     31  [StorableClass]
     32  public class DampenedLinearModel : RegressionModel, IConfidenceRegressionModel {
     33    [Storable]
     34    private IConfidenceRegressionModel Model;
     35    [Storable]
     36    private double Min;
     37    [Storable]
     38    private double Max;
     39    [Storable]
     40    private double Dampening;
     41
     42    [StorableConstructor]
     43    private DampenedLinearModel(bool deserializing) : base(deserializing) { }
     44    private DampenedLinearModel(DampenedLinearModel original, Cloner cloner) : base(original, cloner) {
     45      Model = cloner.Clone(original.Model);
     46      Min = original.Min;
     47      Max = original.Max;
     48      Dampening = original.Dampening;
     49    }
     50    public DampenedLinearModel(IConfidenceRegressionModel model, IRegressionProblemData pd, double dampening) : base(model.TargetVariable) {
     51      Model = model;
     52      Min = pd.TargetVariableTrainingValues.Min();
     53      Max = pd.TargetVariableTrainingValues.Max();
     54      Dampening = dampening;
     55    }
     56    public override IDeepCloneable Clone(Cloner cloner) {
     57      return new DampenedLinearModel(this, cloner);
     58    }
     59    public override IEnumerable<string> VariablesUsedForPrediction {
     60      get { return Model.VariablesUsedForPrediction; }
     61    }
     62    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     63      var slow = Sigmoid(-Dampening);
     64      var shigh = Sigmoid(Dampening);
     65      foreach (var x in Model.GetEstimatedValues(dataset, rows)) {
     66        var y = Rescale(x, Min, Max, -Dampening, Dampening);
     67        y = Sigmoid(y);
     68        y = Rescale(y, slow, shigh, Min, Max);
     69        yield return y;
     70      }
     71    }
     72    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     73      return new ConfidenceRegressionSolution(this, problemData);
     74    }
     75    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     76      return Model.GetEstimatedVariances(dataset, rows);
     77    }
     78
     79    private static double Rescale(double x, double oMin, double oMax, double nMin, double nMax) {
     80      var d = oMax - oMin;
     81      var nd = nMax - nMin;
     82      if (d.IsAlmost(0)) {
     83        d = 1;
     84        nMin += nd / 2;
     85        nd = 0;
     86      }
     87      return ((x - oMin) / d) * nd + nMin;
     88    }
     89    private static double Sigmoid(double x) {
     90      return 1 / (1 + Math.Exp(-x));
     91    }
     92  }
     93}
     94 No newline at end of file
  • LeafModels/PreconstructedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Diagnostics;
     25using System.Linq;
     26using HeuristicLab.Common;
     27using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     28using HeuristicLab.Problems.DataAnalysis;
     29
     30namespace HeuristicLab.Algorithms.DataAnalysis {
     31  //mulitdimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
     32  [StorableClass]
     33  internal sealed class PreconstructedLinearModel : RegressionModel, IConfidenceRegressionModel {
     34    [Storable]
     35    public Dictionary<string, double> Coefficients { get; private set; }
     36    [Storable]
     37    public double Intercept { get; private set; }
     38    [Storable]
     39    private Dictionary<string, double> Means { get; set; }
     40    [Storable]
     41    private Dictionary<string, double> Variances { get; set; }
     42    [Storable]
     43    private double ResidualVariance { get; set; }
     44    [Storable]
     45    private int SampleSize { get; set; }
     46
     47    public override IEnumerable<string> VariablesUsedForPrediction {
     48      get { return Coefficients.Keys; }
     49    }
     50    #region HLConstructors
     51    [StorableConstructor]
     52    private PreconstructedLinearModel(bool deserializing) : base(deserializing) { }
     53    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
     54      if (original.Coefficients != null) Coefficients = original.Coefficients.ToDictionary(x => x.Key, x => x.Value);
     55      Intercept = original.Intercept;
     56      if (original.Means != null) Means = original.Means.ToDictionary(x => x.Key, x => x.Value);
     57      if (original.Variances != null) Variances = original.Variances.ToDictionary(x => x.Key, x => x.Value);
     58      ResidualVariance = original.ResidualVariance;
     59      SampleSize = original.SampleSize;
     60    }
     61    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
     62      Coefficients = coefficients;
     63      Intercept = intercept;
     64      Variances = variances;
     65      Means = means;
     66      ResidualVariance = 0;
     67      SampleSize = 0;
     68    }
     69    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
     70      Coefficients = new Dictionary<string, double>();
     71      Intercept = intercept;
     72      Variances = new Dictionary<string, double>();
     73      ResidualVariance = 0;
     74      SampleSize = 0;
     75    }
     76    public override IDeepCloneable Clone(Cloner cloner) {
     77      return new PreconstructedLinearModel(this, cloner);
     78    }
     79    #endregion
     80
     81    public static PreconstructedLinearModel CreateConfidenceLinearModel(IRegressionProblemData pd, out double rmse, out double cvRmse) {
     82      rmse = double.NaN;
     83      cvRmse = double.NaN;
     84      return AlternativeCalculation(pd);
     85    }
     86
     87    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd, out double rmse, out double cvRmse) {
     88      var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
     89        pd.TargetVariable
     90      }), pd.AllIndices);
     91
     92      var nFeatures = inputMatrix.GetLength(1) - 1;
     93      double[] coefficients;
     94
     95      alglib.linearmodel lm;
     96      alglib.lrreport ar;
     97      int retVal;
     98      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
     99      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
     100      rmse = ar.rmserror;
     101      cvRmse = ar.cvrmserror;
     102
     103      alglib.lrunpack(lm, out coefficients, out nFeatures);
     104
     105
     106      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Average());
     107      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Variance());
     108      var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new {s, d}).ToDictionary(x => x.s, x => x.d);
     109      var res = new PreconstructedLinearModel(means, variances, coeffs, coefficients[nFeatures], pd.TargetVariable);
     110
     111      res.ResidualVariance = pd.TargetVariableValues.Zip(res.GetEstimatedValues(pd.Dataset, pd.TrainingIndices), (x, y) => x - y).Variance();
     112      res.SampleSize = pd.TrainingIndices.Count();
     113      return res;
     114    }
     115
     116    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd) {
     117      var means = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Average());
     118      var variances = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Variance());
     119      var cmean = pd.TargetVariableTrainingValues.Average();
     120      var variables = pd.AllowedInputVariables.ToList();
     121      var n = variables.Count;
     122      var m = pd.TrainingIndices.Count();
     123
     124      //Set up X^T and y
     125      var inTr = new double[n + 1, m];
     126      for (var i = 0; i < n; i++) {
     127        var v = variables[i];
     128        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
     129        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
     130      }
     131
     132      for (var i = 0; i < m; i++) inTr[n, i] = 1;
     133
     134      var y = new double[m, 1];
     135      var ydata = pd.TargetVariableTrainingValues.ToArray();
     136      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];
     137
     138      //Perform linear regression
     139      var aTy = new double[n + 1, 1];
     140      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y;
     141      var aTa = new double[n + 1, n + 1];
     142      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa //
     143      alglib.spdmatrixcholesky(ref aTa, n + 1, true);
     144      int info;
     145      alglib.densesolverreport report;
     146      double[] coefficients;
     147      var aTyVector = new double[n + 1];
     148      for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];
     149      alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
     150      double rmse, cvrmse;
     151      if (info != 1) return ClassicCalculation(pd, out rmse, out cvrmse);
     152
     153      //extract coefficients
     154      var intercept = coefficients[n];
     155      var coeffs = new Dictionary<string, double>();
     156      for (var i = 0; i < n; i++) coeffs.Add(variables[i], coefficients[i]);
     157
     158      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
     159    }
     160
     161    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     162      return rows.Select(row => GetEstimatedValue(dataset, row));
     163    }
     164
     165    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     166      return new RegressionSolution(this, problemData);
     167    }
     168
     169    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     170      return rows.Select(i => GetEstimatedVariance(dataset, i));
     171    }
     172
     173    #region helpers
     174    private double GetEstimatedValue(IDataset dataset, int row) {
     175      return Intercept + (Coefficients.Count == 0 ? 0 : Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row)));
     176    }
     177    private double GetEstimatedVariance(IDataset dataset, int row) {
     178      if (SampleSize == 0) return 0.0;
     179      var sum = (from var in Variances let d = dataset.GetDoubleValue(var.Key, row) - Means[var.Key] select d * d / var.Value).Sum();
     180      var res = ResidualVariance * (1.0 / SampleSize + sum / (SampleSize - 1));
     181      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
     182      return res;
     183    }
     184    #endregion
     185  }
     186}
     187 No newline at end of file
  • LeafModels/ComponentReducedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System.Collections.Generic;
     23using System.Linq;
     24using HeuristicLab.Common;
     25using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     26using HeuristicLab.Problems.DataAnalysis;
     27
     28namespace HeuristicLab.Algorithms.DataAnalysis {
     29  [StorableClass]
     30  public class ComponentReducedLinearModel : RegressionModel, IConfidenceRegressionModel {
     31    [Storable]
     32    private IConfidenceRegressionModel Model;
     33    [Storable]
     34    private PrincipleComponentTransformation Pca;
     35
     36    [StorableConstructor]
     37    private ComponentReducedLinearModel(bool deserializing) : base(deserializing) { }
     38    private ComponentReducedLinearModel(ComponentReducedLinearModel original, Cloner cloner) : base(original, cloner) {
     39      Model = cloner.Clone(original.Model);
     40      Pca = cloner.Clone(original.Pca);
     41    }
     42    public ComponentReducedLinearModel(string targetVariable, IConfidenceRegressionModel model, PrincipleComponentTransformation pca) : base(targetVariable) {
     43      Model = model;
     44      Pca = pca;
     45    }
     46    public override IDeepCloneable Clone(Cloner cloner) {
     47      return new ComponentReducedLinearModel(this, cloner);
     48    }
     49
     50    public override IEnumerable<string> VariablesUsedForPrediction {
     51      get { return Model.VariablesUsedForPrediction; }
     52    }
     53    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     54      var data = ReduceDataset(dataset, rows.ToArray());
     55      return Model.GetEstimatedValues(Pca.TransformDataset(data), Enumerable.Range(0, data.Rows));
     56    }
     57    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     58      return new ConfidenceRegressionSolution(this, problemData);
     59    }
     60    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     61      var data = ReduceDataset(dataset, rows.ToArray());
     62      return Model.GetEstimatedVariances(Pca.TransformDataset(data), Enumerable.Range(0, data.Rows));
     63    }
     64
     65    private IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows) {
     66      return new Dataset(data.DoubleVariables, data.DoubleVariables.Select(v => data.GetDoubleValues(v, rows).ToList()));
     67    }
     68  }
     69}
     70 No newline at end of file
  • LeafModels/DampenedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     27using HeuristicLab.Problems.DataAnalysis;
     28
     29namespace HeuristicLab.Algorithms.DataAnalysis {
     30  //mulitdimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
     31  [StorableClass]
     32  public class DampenedLinearModel : RegressionModel, IConfidenceRegressionModel {
     33    [Storable]
     34    private IConfidenceRegressionModel Model;
     35    [Storable]
     36    private double Min;
     37    [Storable]
     38    private double Max;
     39    [Storable]
     40    private double Dampening;
     41
     42    [StorableConstructor]
     43    private DampenedLinearModel(bool deserializing) : base(deserializing) { }
     44    private DampenedLinearModel(DampenedLinearModel original, Cloner cloner) : base(original, cloner) {
     45      Model = cloner.Clone(original.Model);
     46      Min = original.Min;
     47      Max = original.Max;
     48      Dampening = original.Dampening;
     49    }
     50    public DampenedLinearModel(IConfidenceRegressionModel model, IRegressionProblemData pd, double dampening) : base(model.TargetVariable) {
     51      Model = model;
     52      Min = pd.TargetVariableTrainingValues.Min();
     53      Max = pd.TargetVariableTrainingValues.Max();
     54      Dampening = dampening;
     55    }
     56    public override IDeepCloneable Clone(Cloner cloner) {
     57      return new DampenedLinearModel(this, cloner);
     58    }
     59    public override IEnumerable<string> VariablesUsedForPrediction {
     60      get { return Model.VariablesUsedForPrediction; }
     61    }
     62    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     63      var slow = Sigmoid(-Dampening);
     64      var shigh = Sigmoid(Dampening);
     65      foreach (var x in Model.GetEstimatedValues(dataset, rows)) {
     66        var y = Rescale(x, Min, Max, -Dampening, Dampening);
     67        y = Sigmoid(y);
     68        y = Rescale(y, slow, shigh, Min, Max);
     69        yield return y;
     70      }
     71    }
     72    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     73      return new ConfidenceRegressionSolution(this, problemData);
     74    }
     75    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     76      return Model.GetEstimatedVariances(dataset, rows);
     77    }
     78
     79    private static double Rescale(double x, double oMin, double oMax, double nMin, double nMax) {
     80      var d = oMax - oMin;
     81      var nd = nMax - nMin;
     82      if (d.IsAlmost(0)) {
     83        d = 1;
     84        nMin += nd / 2;
     85        nd = 0;
     86      }
     87      return ((x - oMin) / d) * nd + nMin;
     88    }
     89    private static double Sigmoid(double x) {
     90      return 1 / (1 + Math.Exp(-x));
     91    }
     92  }
     93}
     94 No newline at end of file
  • LeafModels/PreconstructedLinearModel.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Diagnostics;
     25using System.Linq;
     26using HeuristicLab.Common;
     27using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     28using HeuristicLab.Problems.DataAnalysis;
     29
     30namespace HeuristicLab.Algorithms.DataAnalysis {
     31  //mulitdimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
     32  [StorableClass]
     33  internal sealed class PreconstructedLinearModel : RegressionModel, IConfidenceRegressionModel {
     34    [Storable]
     35    public Dictionary<string, double> Coefficients { get; private set; }
     36    [Storable]
     37    public double Intercept { get; private set; }
     38    [Storable]
     39    private Dictionary<string, double> Means { get; set; }
     40    [Storable]
     41    private Dictionary<string, double> Variances { get; set; }
     42    [Storable]
     43    private double ResidualVariance { get; set; }
     44    [Storable]
     45    private int SampleSize { get; set; }
     46
     47    public override IEnumerable<string> VariablesUsedForPrediction {
     48      get { return Coefficients.Keys; }
     49    }
     50    #region HLConstructors
     51    [StorableConstructor]
     52    private PreconstructedLinearModel(bool deserializing) : base(deserializing) { }
     53    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
     54      if (original.Coefficients != null) Coefficients = original.Coefficients.ToDictionary(x => x.Key, x => x.Value);
     55      Intercept = original.Intercept;
     56      if (original.Means != null) Means = original.Means.ToDictionary(x => x.Key, x => x.Value);
     57      if (original.Variances != null) Variances = original.Variances.ToDictionary(x => x.Key, x => x.Value);
     58      ResidualVariance = original.ResidualVariance;
     59      SampleSize = original.SampleSize;
     60    }
     61    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
     62      Coefficients = coefficients;
     63      Intercept = intercept;
     64      Variances = variances;
     65      Means = means;
     66      ResidualVariance = 0;
     67      SampleSize = 0;
     68    }
     69    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
     70      Coefficients = new Dictionary<string, double>();
     71      Intercept = intercept;
     72      Variances = new Dictionary<string, double>();
     73      ResidualVariance = 0;
     74      SampleSize = 0;
     75    }
     76    public override IDeepCloneable Clone(Cloner cloner) {
     77      return new PreconstructedLinearModel(this, cloner);
     78    }
     79    #endregion
     80
     81    public static PreconstructedLinearModel CreateConfidenceLinearModel(IRegressionProblemData pd, out double rmse, out double cvRmse) {
     82      rmse = double.NaN;
     83      cvRmse = double.NaN;
     84      return AlternativeCalculation(pd);
     85    }
     86
     87    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd, out double rmse, out double cvRmse) {
     88      var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
     89        pd.TargetVariable
     90      }), pd.AllIndices);
     91
     92      var nFeatures = inputMatrix.GetLength(1) - 1;
     93      double[] coefficients;
     94
     95      alglib.linearmodel lm;
     96      alglib.lrreport ar;
     97      int retVal;
     98      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
     99      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
     100      rmse = ar.rmserror;
     101      cvRmse = ar.cvrmserror;
     102
     103      alglib.lrunpack(lm, out coefficients, out nFeatures);
     104
     105
     106      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Average());
     107      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Variance());
     108      var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new {s, d}).ToDictionary(x => x.s, x => x.d);
     109      var res = new PreconstructedLinearModel(means, variances, coeffs, coefficients[nFeatures], pd.TargetVariable);
     110
     111      res.ResidualVariance = pd.TargetVariableValues.Zip(res.GetEstimatedValues(pd.Dataset, pd.TrainingIndices), (x, y) => x - y).Variance();
     112      res.SampleSize = pd.TrainingIndices.Count();
     113      return res;
     114    }
     115
     116    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd) {
     117      var means = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Average());
     118      var variances = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Variance());
     119      var cmean = pd.TargetVariableTrainingValues.Average();
     120      var variables = pd.AllowedInputVariables.ToList();
     121      var n = variables.Count;
     122      var m = pd.TrainingIndices.Count();
     123
     124      //Set up X^T and y
     125      var inTr = new double[n + 1, m];
     126      for (var i = 0; i < n; i++) {
     127        var v = variables[i];
     128        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
     129        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
     130      }
     131
     132      for (var i = 0; i < m; i++) inTr[n, i] = 1;
     133
     134      var y = new double[m, 1];
     135      var ydata = pd.TargetVariableTrainingValues.ToArray();
     136      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];
     137
     138      //Perform linear regression
     139      var aTy = new double[n + 1, 1];
     140      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y;
     141      var aTa = new double[n + 1, n + 1];
     142      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa //
     143      alglib.spdmatrixcholesky(ref aTa, n + 1, true);
     144      int info;
     145      alglib.densesolverreport report;
     146      double[] coefficients;
     147      var aTyVector = new double[n + 1];
     148      for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];
     149      alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
     150      double rmse, cvrmse;
     151      if (info != 1) return ClassicCalculation(pd, out rmse, out cvrmse);
     152
     153      //extract coefficients
     154      var intercept = coefficients[n];
     155      var coeffs = new Dictionary<string, double>();
     156      for (var i = 0; i < n; i++) coeffs.Add(variables[i], coefficients[i]);
     157
     158      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
     159    }
     160
     161    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
     162      return rows.Select(row => GetEstimatedValue(dataset, row));
     163    }
     164
     165    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
     166      return new RegressionSolution(this, problemData);
     167    }
     168
     169    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
     170      return rows.Select(i => GetEstimatedVariance(dataset, i));
     171    }
     172
     173    #region helpers
     174    private double GetEstimatedValue(IDataset dataset, int row) {
     175      return Intercept + (Coefficients.Count == 0 ? 0 : Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row)));
     176    }
     177    private double GetEstimatedVariance(IDataset dataset, int row) {
     178      if (SampleSize == 0) return 0.0;
     179      var sum = (from var in Variances let d = dataset.GetDoubleValue(var.Key, row) - Means[var.Key] select d * d / var.Value).Sum();
     180      var res = ResidualVariance * (1.0 / SampleSize + sum / (SampleSize - 1));
     181      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
     182      return res;
     183    }
     184    #endregion
     185  }
     186}
     187 No newline at end of file
  • LeafTypes/ComplexLeaf.cs

     
    6666      if (t == null) throw new ArgumentException("No RegressionSolution was provided by the algorithm");
    6767      return t.Model;
    6868    }
    69 
    7069    public int MinLeafSize(IRegressionProblemData pd) {
    7170      return 3;
    7271    }
  • LeafTypes/LinearLeaf.cs

     
    4949      if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model");
    5050      double rmse, cvRmse;
    5151      noParameters = pd.AllowedInputVariables.Count() + 1;
    52       return PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse);
     52      var res = PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse);
     53      return res;
    5354    }
    5455
    5556    public int MinLeafSize(IRegressionProblemData pd) {
    56       return pd.AllowedInputVariables.Count() + 2;
     57      return pd.AllowedInputVariables.Count() == 1 ? 2 : pd.AllowedInputVariables.Count() + 2;
    5758    }
    5859    #endregion
    5960  }
  • LeafTypes/LogisticLeaf.cs

     
    5858      get { return true; }
    5959    }
    6060    public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
    61       if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model");
    62       double rmse, cvRmse;
    63       noParameters = pd.AllowedInputVariables.Count() + 1;
    64       return new DampenedLinearModel(PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse), pd, Dampening);
     61      var res = (IConfidenceRegressionModel)new LinearLeaf().Build(pd, random, cancellationToken, out noParameters);
     62      return new DampenedLinearModel(res, pd, Dampening);
    6563    }
    6664
    6765    public int MinLeafSize(IRegressionProblemData pd) {
  • LeafTypes/M5Leaf.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using System.Threading;
     26using HeuristicLab.Common;
     27using HeuristicLab.Core;
     28using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     29using HeuristicLab.Problems.DataAnalysis;
     30
     31namespace HeuristicLab.Algorithms.DataAnalysis {
     32  [StorableClass]
     33  [Item("M5Leaf", "A leaf type that uses linear models as leaf models. This is the standard for M5' regression")]
     34  public class M5Leaf : ParameterizedNamedItem, ILeafModel {
     35    #region Constructors & Cloning
     36    [StorableConstructor]
     37    private M5Leaf(bool deserializing) : base(deserializing) { }
     38    private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { }
     39    public M5Leaf() { }
     40    public override IDeepCloneable Clone(Cloner cloner) {
     41      return new M5Leaf(this, cloner);
     42    }
     43    #endregion
     44
     45    #region IModelType
     46    public bool ProvidesConfidence {
     47      get { return false; }
     48    }
     49    public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
     50      if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model");
     51
     52      if (pd.Dataset.Rows == 1)
     53        return new ConstantLeaf().Build(pd, random, cancellationToken, out noParameters);
     54
     55      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average());
     56      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Variance());
     57      var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList();
     58
     59      var classMean = pd.TargetVariableTrainingValues.Average();
     60      var classVar = pd.TargetVariableTrainingValues.Variance();
     61
     62      var model = FindBestModel(variances, means, classMean, classVar, pd, used);
     63      noParameters = 1 + model.Coefficients.Count;
     64      return model;
     65    }
     66
     67
     68    private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double cMean, double cVar, IRegressionProblemData pd, IList<string> variables) {
     69      Dictionary<string, double> coeffs;
     70      double intercept;
     71      do {
     72        coeffs = DoRegression(pd, variables, variances, means, cMean, 1.0e-8, out intercept);
     73        variables = DeselectColinear(variances, coeffs, cVar, pd, variables);
     74      }
     75      while (coeffs.Count != variables.Count);
     76      var numAtts = variables.Count;
     77      var numInst = pd.TrainingIndices.Count();
     78      var fullMse = CalculateSE(coeffs, intercept, pd, variables);
     79      var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts;
     80
     81      var improved = true;
     82      var currentNumAttributes = numAtts;
     83
     84      while (improved && currentNumAttributes > 1) {
     85        improved = false;
     86        currentNumAttributes--;
     87        // Find attribute with smallest SC
     88        var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / cVar)))
     89          .OrderBy(x => x.Value).Select(x => x.Key).First();
     90
     91        var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
     92        var currentIntercept = 0.0;
     93        var currentCoeffs = DoRegression(pd, currVariables, variances, means, cMean, 1.0e-8, out currentIntercept);
     94        var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
     95        var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;
     96
     97        if (!(currentAkaike < akaike)) continue;
     98        improved = true;
     99        akaike = currentAkaike;
     100        coeffs = currentCoeffs;
     101        intercept = currentIntercept;
     102        variables = currVariables;
     103      }
     104
     105      var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable);
     106      pd2.TestPartition.End = pd.TestPartition.End;
     107      pd2.TestPartition.Start = pd.TestPartition.Start;
     108      pd2.TrainingPartition.End = pd.TrainingPartition.End;
     109      pd2.TrainingPartition.Start = pd.TrainingPartition.Start;
     110
     111      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
     112    }
     113
     114
     115    private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double cmean, double ridge, out double intercept) {
     116      //if (pd.TrainingIndices.Count() > variables.Count) {
     117      //  var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable);
     118      //  pd2.TestPartition.End = pd.TestPartition.End;
     119      //  pd2.TestPartition.Start = pd.TestPartition.Start;
     120      //  pd2.TrainingPartition.End = pd.TrainingPartition.End;
     121      //  pd2.TrainingPartition.Start = pd.TrainingPartition.Start;
     122      //
     123      //  double x1, x2;
     124      //  var lm = PreconstructedLinearModel.CreateConfidenceLinearModel(pd2, out x1, out x2);
     125      //  intercept = lm.Intercept;
     126      //  return lm.Coefficients;
     127
     128      var n = variables.Count;
     129      var m = pd.TrainingIndices.Count();
     130
     131      var inTr = new double[n, m];
     132      for (var i = 0; i < n; i++) {
     133        var v = variables[i];
     134        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
     135        var sd = Math.Sqrt(variances[v]);
     136        var mean = means[v];
     137        for (var j = 0; j < m; j++) {
     138          inTr[i, j] = (vdata[j] - mean) / sd;
     139        }
     140      }
     141
     142      var y = new double[m, 1];
     143      var ydata = pd.TargetVariableTrainingValues.ToArray();
     144      for (var i = 0; i < m; i++)
     145        y[i, 0] = ydata[i]; //no scaling for targets;
     146
     147
     148      var aTy = new double[n, 1];
     149      alglib.rmatrixgemm(n, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y;
     150      var aTa = new double[n, n];
     151      alglib.rmatrixgemm(n, n, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa //
     152
     153      var aTaDecomp = new double[n, n];
     154      bool success;
     155      var tries = 0;
     156      double[] coefficients = null;
     157      do {
     158        for (var i = 0; i < n; i++) aTa[i, i] += ridge; // add ridge to diagonal to enforce singularity
     159        try {
     160          //solve "aTa * coefficients = aTy" for coefficients;
     161          Array.Copy(aTa, 0, aTaDecomp, 0, aTa.Length);
     162          alglib.spdmatrixcholesky(ref aTaDecomp, n, true);
     163          int info;
     164          alglib.densesolverreport report;
     165          alglib.spdmatrixcholeskysolve(aTaDecomp, n, true, ydata, out info, out report, out coefficients);
     166
     167          if (info != 1) throw new Exception();
     168          success = true;
     169        }
     170        catch (Exception) {
     171          for (var i = 0; i < n; i++) aTa[i, i] -= ridge;
     172          ridge *= 10; // increase ridge;
     173          success = false;
     174        }
     175        finally {
     176          tries++;
     177        }
     178      }
     179      while (!success && tries < 100);
     180      if (coefficients == null) throw new ArgumentException("No linear model could be built");
     181
     182      intercept = cmean;
     183      var res = new Dictionary<string, double>();
     184      for (var i = 0; i < n; i++) {
     185        var v = variables[i];
     186        res.Add(v, coefficients[i] /= Math.Sqrt(variances[v]));
     187        intercept -= coefficients[i] * means[v];
     188      }
     189
     190      return res;
     191    }
     192
     193    private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double cVar, IRegressionProblemData pd, IList<string> variables) {
     194      var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / cVar))).Where(x => x.Value > 1.5).OrderBy(x => -x.Value).ToList();
     195      if (candidates.Count == 0) return variables;
     196      var c = candidates.First().Key;
     197      return variables.Where(v => !v.Equals(c)).ToList();
     198    }
     199    private static double CalculateSE(Dictionary<string, double> coefficients, double intercept, IRegressionProblemData pd, IList<string> variables) {
     200      return pd.TrainingIndices.Select(i => RegressionPrediction(i, pd, variables, coefficients, intercept) - pd.Dataset.GetDoubleValue(pd.TargetVariable, i)).Select(error => error * error).Sum();
     201    }
     202    private static double RegressionPrediction(int i, IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> coefficients, double intercept) {
     203      return intercept + variables.Sum(v => pd.Dataset.GetDoubleValue(v, i) * coefficients[v]);
     204    }
     205    public int MinLeafSize(IRegressionProblemData pd) {
     206      return 1;
     207    }
     208    #endregion
     209  }
     210}
     211 No newline at end of file
  • LeafTypes/M5regLeaf.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Linq;
     24using System.Threading;
     25using HeuristicLab.Algorithms.DataAnalysis.Glmnet;
     26using HeuristicLab.Common;
     27using HeuristicLab.Core;
     28using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     29using HeuristicLab.Problems.DataAnalysis;
     30
     31namespace HeuristicLab.Algorithms.DataAnalysis {
     32  [StorableClass]
     33  [Item("M5regLeaf", "A leaf type that uses linear models as leaf models. This is the standard for M5' regression")]
     34  public class M5regLeaf : ParameterizedNamedItem, ILeafModel {
     35    #region Constructors & Cloning
     36    [StorableConstructor]
     37    private M5regLeaf(bool deserializing) : base(deserializing) { }
     38    private M5regLeaf(M5regLeaf original, Cloner cloner) : base(original, cloner) { }
     39    public M5regLeaf() { }
     40    public override IDeepCloneable Clone(Cloner cloner) {
     41      return new M5regLeaf(this, cloner);
     42    }
     43    #endregion
     44
     45    #region IModelType
     46    public bool ProvidesConfidence {
     47      get { return true; }
     48    }
     49    public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
     50      if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model");
     51      double rmse, cvRmse;
     52      noParameters = pd.AllowedInputVariables.Count() + 1;
     53
     54      double x1, x2;
     55      var coeffs = ElasticNetLinearRegression.CalculateModelCoefficients(pd, 1, 0.2, out x1, out x2);
     56      noParameters = coeffs.Length;
     57      return ElasticNetLinearRegression.CreateSymbolicSolution(coeffs, pd).Model;
     58    }
     59
     60    public int MinLeafSize(IRegressionProblemData pd) {
     61      return pd.AllowedInputVariables.Count() + 2;
     62    }
     63    #endregion
     64  }
     65}
     66 No newline at end of file
  • M5Regression.cs

     
    2121    #region Parametername
    2222    private const string GenerateRulesParameterName = "GenerateRules";
    2323    private const string HoldoutSizeParameterName = "HoldoutSize";
    24     private const string SpliterParameterName = "Spliter";
     24    private const string SpliterParameterName = "Splitter";
    2525    private const string MinimalNodeSizeParameterName = "MinimalNodeSize";
    2626    private const string LeafModelParameterName = "LeafModel";
    2727    private const string PruningTypeParameterName = "PruningType";
     
    3737    public IFixedValueParameter<PercentValue> HoldoutSizeParameter {
    3838      get { return (IFixedValueParameter<PercentValue>)Parameters[HoldoutSizeParameterName]; }
    3939    }
    40     public IConstrainedValueParameter<ISpliter> ImpurityParameter {
    41       get { return (IConstrainedValueParameter<ISpliter>)Parameters[SpliterParameterName]; }
     40    public IConstrainedValueParameter<ISplitter> ImpurityParameter {
     41      get { return (IConstrainedValueParameter<ISplitter>)Parameters[SpliterParameterName]; }
    4242    }
    4343    public IFixedValueParameter<IntValue> MinimalNodeSizeParameter {
    4444      get { return (IFixedValueParameter<IntValue>)Parameters[MinimalNodeSizeParameterName]; }
     
    6767    public double HoldoutSize {
    6868      get { return HoldoutSizeParameter.Value.Value; }
    6969    }
    70     public ISpliter Split {
     70    public ISplitter Split {
    7171      get { return ImpurityParameter.Value; }
    7272    }
    7373    public int MinimalNodeSize {
     
    9797    public M5Regression() {
    9898      var modelSet = new ItemSet<ILeafModel>(ApplicationManager.Manager.GetInstances<ILeafModel>());
    9999      var pruningSet = new ItemSet<IPruning>(ApplicationManager.Manager.GetInstances<IPruning>());
    100       var impuritySet = new ItemSet<ISpliter>(ApplicationManager.Manager.GetInstances<ISpliter>());
     100      var impuritySet = new ItemSet<ISplitter>(ApplicationManager.Manager.GetInstances<ISplitter>());
    101101      Parameters.Add(new FixedValueParameter<BoolValue>(GenerateRulesParameterName, "Whether a set of rules or a decision tree shall be created", new BoolValue(false)));
    102102      Parameters.Add(new FixedValueParameter<PercentValue>(HoldoutSizeParameterName, "How much of the training set shall be reserved for pruning", new PercentValue(0.2)));
    103       Parameters.Add(new ConstrainedValueParameter<ISpliter>(SpliterParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<M5Spliter>().First()));
     103      Parameters.Add(new ConstrainedValueParameter<ISplitter>(SpliterParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<M5Splitter>().First()));
    104104      Parameters.Add(new FixedValueParameter<IntValue>(MinimalNodeSizeParameterName, "The minimal number of samples in a leaf node", new IntValue(1)));
    105105      Parameters.Add(new ConstrainedValueParameter<ILeafModel>(LeafModelParameterName, "The type of model used for the nodes", modelSet, modelSet.OfType<LinearLeaf>().First()));
    106106      Parameters.Add(new ConstrainedValueParameter<IPruning>(PruningTypeParameterName, "The type of pruning used", pruningSet, pruningSet.OfType<M5LinearBottomUpPruning>().First()));
     
    119119      if (SetSeedRandomly) SeedParameter.Value.Value = new System.Random().Next();
    120120      random.Reset(Seed);
    121121      var solution = CreateM5RegressionSolution(Problem.ProblemData, random, LeafModel, Split, Pruning, UseHoldout, HoldoutSize, MinimalNodeSize, GenerateRules, Results, cancellationToken);
    122       AnalyzeSolution(solution);
     122      AnalyzeSolution(solution, Results, Problem.ProblemData);
    123123    }
    124124
    125125    #region Static Interface
    126126    public static IRegressionSolution CreateM5RegressionSolution(IRegressionProblemData problemData, IRandom random,
    127       ILeafModel leafModel = null, ISpliter spliter = null, IPruning pruning = null,
     127      ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null,
    128128      bool useHoldout = false, double holdoutSize = 0.2, int minNumInstances = 4, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) {
    129129      //set default values
    130130      if (leafModel == null) leafModel = new LinearLeaf();
    131       if (spliter == null) spliter = new M5Spliter();
     131      if (splitter == null) splitter = new M5Splitter();
    132132      if (cancellationToken == null) cancellationToken = CancellationToken.None;
    133133      if (pruning == null) pruning = new M5LeafBottomUpPruning();
    134134
     135      //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise
    135136      var doubleVars = new HashSet<string>(problemData.Dataset.DoubleVariables);
    136137      var vars = problemData.AllowedInputVariables.Concat(new[] {problemData.TargetVariable}).ToArray();
    137138      if (vars.Any(v => !doubleVars.Contains(v))) throw new NotSupportedException("M5 regression supports only double valued input or output features.");
    138 
    139       var values = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();
    140       if (values.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x))))
     139      var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();
     140      if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x))))
    141141        throw new NotSupportedException("M5 regression does not support NaN or infinity values in the input dataset.");
    142 
    143       var trainingData = new Dataset(vars, values);
     142      var trainingData = new Dataset(vars, doubles);
    144143      var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);
    145144      pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;
    146145      pd.TrainingPartition.Start = 0;
    147146
    148       //create & build Model
    149       var m5Params = new M5Parameters(pruning, minNumInstances, leafModel, pd, random, spliter, results);
    150 
     147      //initialize M5Parameters and pruning set
     148      var m5Params = new M5Parameters(pruning, minNumInstances, leafModel, pd, random, splitter, results);
    151149      IReadOnlyList<int> trainingRows, pruningRows;
    152150      GeneratePruningSet(problemData.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingRows, out pruningRows);
    153151
     152      //create & build Model
    154153      IM5Model model;
    155       if (generateRules)
    156         model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params);
    157       else
    158         model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params);
     154      if (generateRules) model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params);
     155      else model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params);
     156      model.Build(trainingRows, pruningRows, m5Params, cancellationToken.Value);
    159157
    160       model.Build(trainingRows, pruningRows, m5Params, cancellationToken.Value);
    161158      return model.CreateRegressionSolution(problemData);
    162159    }
    163160
    164     public static void UpdateM5Model(IRegressionModel model, IRegressionProblemData problemData, IRandom random,
    165       ILeafModel leafModel, CancellationToken? cancellationToken = null) {
    166       var m5Model = model as IM5Model;
    167       if (m5Model == null) throw new ArgumentException("This type of model can not be updated");
    168       UpdateM5Model(m5Model, problemData, random, leafModel, cancellationToken);
    169     }
    170 
    171     private static void UpdateM5Model(IM5Model model, IRegressionProblemData problemData, IRandom random,
    172       ILeafModel leafModel = null, CancellationToken? cancellationToken = null) {
     161    public static void UpdateM5Model(IM5Model model, IRegressionProblemData problemData, IRandom random, ILeafModel leafModel, CancellationToken? cancellationToken = null) {
    173162      if (cancellationToken == null) cancellationToken = CancellationToken.None;
    174163      var m5Params = new M5Parameters(leafModel, problemData, random);
    175164      model.Update(problemData.TrainingIndices.ToList(), m5Params, cancellationToken.Value);
     
    189178      training = perm.Take(cut).Select(i => allrows[i]).ToArray();
    190179    }
    191180
    192     private void AnalyzeSolution(IRegressionSolution solution) {
     181    private static void AnalyzeSolution(IRegressionSolution solution, ResultCollection Results, IRegressionProblemData problemData) {
    193182      Results.Add(new Result("RegressionSolution", (IItem)solution.Clone()));
    194183
    195       Dictionary<string, int> frequencies;
    196       if (!GenerateRules) {
    197         Results.Add(M5Analyzer.CreateLeafDepthHistogram((M5TreeModel)solution.Model));
    198         frequencies = M5Analyzer.GetTreeVariableFrequences((M5TreeModel)solution.Model);
     184      Dictionary<string, int> frequencies = null;
     185
     186      var tree = solution.Model as M5TreeModel;
     187      if (tree != null) {
     188        Results.Add(M5Analyzer.CreateLeafDepthHistogram(tree));
     189        frequencies = M5Analyzer.GetTreeVariableFrequences(tree);
     190        M5Analyzer.AnalyzeNodes(tree, Results, problemData);
    199191      }
    200       else {
    201         Results.Add(M5Analyzer.CreateRulesResult((M5RuleSetModel)solution.Model, Problem.ProblemData, "M5TreeResult", true));
    202         frequencies = M5Analyzer.GetRuleVariableFrequences((M5RuleSetModel)solution.Model);
    203         Results.Add(M5Analyzer.CreateCoverageDiagram((M5RuleSetModel)solution.Model, Problem.ProblemData));
     192
     193      var ruleSet = solution.Model as M5RuleSetModel;
     194      if (ruleSet != null) {
     195        Results.Add(M5Analyzer.CreateRulesResult(ruleSet, problemData, "M5Rules", true));
     196        frequencies = M5Analyzer.GetRuleVariableFrequences(ruleSet);
     197        Results.Add(M5Analyzer.CreateCoverageDiagram(ruleSet, problemData));
    204198      }
    205199
    206200      //Variable frequencies
    207       var sum = frequencies.Values.Sum();
    208       sum = sum == 0 ? 1 : sum;
    209       var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) {
    210         ElementNames = frequencies.Select(i => i.Key)
    211       };
    212       Results.Add(new Result("Variable Frequences", "relative frequencies of variables in rules and tree nodes", impactArray));
     201      if (frequencies != null) {
     202        var sum = frequencies.Values.Sum();
     203        sum = sum == 0 ? 1 : sum;
     204        var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) {
     205          ElementNames = frequencies.Select(i => i.Key)
     206        };
     207        Results.Add(new Result("Variable Frequences", "relative frequencies of variables in rules and tree nodes", impactArray));
     208      }
    213209    }
    214210    #endregion
    215211  }
  • M5Utilities/M5Analyzer.cs

     
    2222using System.Collections.Generic;
    2323using System.Linq;
    2424using HeuristicLab.Analysis;
     25using HeuristicLab.Common;
    2526using HeuristicLab.Data;
     27using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
    2628using HeuristicLab.Optimization;
     29using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
    2730using HeuristicLab.Problems.DataAnalysis;
     31using HeuristicLab.Problems.DataAnalysis.Symbolic;
    2832
    2933namespace HeuristicLab.Algorithms.DataAnalysis {
    30   internal static class M5Analyzer {
     34  public static class M5Analyzer {
    3135    private const string ConditionResultName = "Condition";
    3236    private const string CoverResultName = "Covered Instances";
    3337    private const string CoverageDiagramResultName = "Coverage";
     
    5256    public static Result CreateLeafDepthHistogram(M5TreeModel treeModel) {
    5357      var list = new List<int>();
    5458      GetLeafDepths(treeModel.Root, 0, list);
    55       var row = new DataRow("Depths", "", list.Select(x => (double) x)) {
     59      var row = new DataRow("Depths", "", list.Select(x => (double)x)) {
    5660        VisualProperties = {ChartType = DataRowVisualProperties.DataRowChartType.Histogram}
    5761      };
    5862      var hist = new DataTable("LeafDepths");
     
    130134          }
    131135      return res;
    132136    }
     137
    /// <summary>
    /// Adds a "DecisionTree" result that mirrors the structure of <paramref name="tree"/> and, unless the tree
    /// has more than 200 leaves, a "NodeModels" collection with one regression solution per leaf.
    /// </summary>
    /// <param name="tree">The tree model whose nodes are analyzed.</param>
    /// <param name="results">The collection the results are added to (existing entries are updated).</param>
    /// <param name="pd">The problem data; its training indices are used to count the instances per leaf.</param>
    public static void AnalyzeNodes(M5TreeModel tree, ResultCollection results, IRegressionProblemData pd) {
      var leafsById = new Dictionary<int, M5NodeModel>();
      var idCounter = new IntValue(1);
      var trainingRows = pd.TrainingIndices.ToList();
      var mirroredRoot = MirrorTree(tree.Root, leafsById, idCounter, pd.Dataset, trainingRows);
      results.AddOrUpdateResult("DecisionTree", new SymbolicExpressionTree(mirroredRoot));

      //no per-leaf solutions are created for trees with more than 200 leaves
      if (leafsById.Count > 200) return;

      var leafSolutions = new ResultCollection();
      results.AddOrUpdateResult("NodeModels", leafSolutions);
      foreach (var entry in leafsById.OrderBy(kv => kv.Key))
        leafSolutions.AddOrUpdateResult("Model " + entry.Key, entry.Value.CreateRegressionSolution(pd));
    }
     151
    /// <summary>
    /// Recursively converts an M5 node into a symbolic expression tree node labelled with descriptive text.
    /// Leaves are numbered consecutively via <paramref name="nextId"/> and registered in <paramref name="dict"/>;
    /// inner nodes show their split condition and pruning strength.
    /// </summary>
    private static SymbolicExpressionTreeNode MirrorTree(M5NodeModel node, IDictionary<int, M5NodeModel> dict, IntValue nextId, IDataset data, IReadOnlyList<int> rows) {
      if (!node.IsLeaf) {
        var condition = node.SplitAttribute + " <= " + node.SplitValue.ToString("0.###") + " pf= " + node.PruningStrength.ToString("0.###");
        var innerNode = new SymbolicExpressionTreeNode(new TextSymbol(condition));

        //distribute the rows to the children the same way the split does
        IReadOnlyList<int> leftRows, rightRows;
        M5StaticUtilities.SplitRows(rows, data, node.SplitAttribute, node.SplitValue, out leftRows, out rightRows);

        innerNode.AddSubtree(MirrorTree(node.Left, dict, nextId, data, leftRows));
        innerNode.AddSubtree(MirrorTree(node.Right, dict, nextId, data, rightRows));
        return innerNode;
      }

      //leaf: take the current id and advance the shared counter
      var id = nextId.Value;
      nextId.Value = id + 1;
      dict.Add(id, node);
      return new SymbolicExpressionTreeNode(new TextSymbol("Model " + id + " " + rows.Count + " Instances"));
    }
     169
     170
    /// <summary>
    /// Symbol whose name is an arbitrary display text. It is used to label the nodes of the mirrored
    /// decision tree and accepts any number of subtrees.
    /// </summary>
    [StorableClass]
    private class TextSymbol : Symbol {
      [StorableConstructor]
      public TextSymbol(bool deserializing) : base(deserializing) { }
      //cloning constructor: typed to TextSymbol so that only text symbols can be cloned into a text symbol
      public TextSymbol(TextSymbol original, Cloner cloner) : base(original, cloner) { }
      public TextSymbol(string name) : base(name, "") {
        this.Name = name;
      }
      public override IDeepCloneable Clone(Cloner cloner) {
        return new TextSymbol(this, cloner);
      }
      //a text node may be a leaf ...
      public override int MinimumArity {
        get { return 0; }
      }
      //... or carry an arbitrary number of children
      public override int MaximumArity {
        get { return int.MaxValue; }
      }
    }
    133189  }
    134190}
     191 No newline at end of file
  • M5Utilities/M5Parameters.cs

     
    2626using HeuristicLab.Problems.DataAnalysis;
    2727
    2828namespace HeuristicLab.Algorithms.DataAnalysis {
    29   internal class M5Parameters {
    30     private readonly ISpliter splitter;
     29  public class M5Parameters {
     30    private readonly ISplitter splitter;
    3131    private readonly IPruning pruning;
    3232    private readonly ILeafModel leafModel;
    3333    private readonly int minLeafSize;
     
    3434    private readonly IRegressionProblemData problemData;
    3535    private readonly IRandom random;
    3636    private readonly ResultCollection results;
    37     public ISpliter Spliter {
     37    public ISplitter Splitter {
    3838      get { return splitter; }
    3939    }
    4040    public IPruning Pruning {
     
    6666    }
    6767
    6868    public M5Parameters(IPruning pruning, int minleafSize, ILeafModel leafModel,
    69       IRegressionProblemData problemData, IRandom random, ISpliter splitter, ResultCollection results) {
     69      IRegressionProblemData problemData, IRandom random, ISplitter splitter, ResultCollection results) {
    7070      this.problemData = problemData;
    7171      this.random = random;
    7272      this.leafModel = leafModel;
  • M5Utilities/M5StaticUtilities.cs

     
    4242      }
    4343      if (alg.ExecutionState != ExecutionState.Paused) alg.Prepare();
    4444      alg.Start(cancellationToken);
    45       return alg.Results;
     45      var res = alg.Results;
     46      alg.Runs.Clear();
     47      return res;
    4648    }
    4749
    4850    public static void SplitRows(IReadOnlyList<int> rows, IDataset data, string splitAttr, double splitValue, out IReadOnlyList<int> leftRows, out IReadOnlyList<int> rightRows) {
    49       //TODO check and revert points at borders are now used multipe times
     51      //TODO check and revert?: points at borders are now used multiple times
    5052      var assignment = data.GetDoubleValues(splitAttr, rows).Select(x => x.IsAlmost(splitValue) ? 2 : x < splitValue ? 0 : 1).ToArray();
    5153      leftRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b == 0 || x.b == 2).Select(x => x.i).ToList();
    5254      rightRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b > 0).Select(x => x.i).ToList();
  • MetaModels/M5NodeModel.cs

     
    3131
    3232namespace HeuristicLab.Algorithms.DataAnalysis {
    3333  [StorableClass]
    34   internal class M5NodeModel : RegressionModel {
     34  public class M5NodeModel : RegressionModel {
    3535    #region Properties
     36    public double PruningStrength;
     37
    3638    [Storable]
    3739    internal bool IsLeaf { get; private set; }
    3840    [Storable]
     
    106108      SplitValue = double.NaN;
    107109      string attr;
    108110      double splitValue;
    109       IsLeaf = !m5Params.Spliter.Split(new RegressionProblemData(M5StaticUtilities.ReduceDataset(m5Params.Data, rows, variables, TargetVariable), variables, TargetVariable), m5Params.MinLeafSize, out attr, out splitValue);
     111      IsLeaf = !m5Params.Splitter.Split(new RegressionProblemData(M5StaticUtilities.ReduceDataset(m5Params.Data, rows, variables, TargetVariable), variables, TargetVariable), m5Params.MinLeafSize, out attr, out splitValue);
    110112      if (IsLeaf) return;
    111113
    112114      //split Dataset
  • MetaModels/M5RuleModel.cs

     
    3131
    3232namespace HeuristicLab.Algorithms.DataAnalysis {
    3333  [StorableClass]
    34   internal class M5RuleModel : RegressionModel {
     34  public class M5RuleModel : RegressionModel {
    3535    #region Properties
    3636    [Storable]
    3737    internal string[] SplitAttributes { get; private set; }
  • MetaModels/M5RuleSetModel.cs

     
    3131
    3232namespace HeuristicLab.Algorithms.DataAnalysis {
    3333  [StorableClass]
    34   internal class M5RuleSetModel : RegressionModel, IM5Model {
     34  public class M5RuleSetModel : RegressionModel, IM5Model {
    3535    private const string NumRulesResultName = "Number of rules";
    3636    private const string CoveredInstancesResultName = "Covered instances";
    3737
  • MetaModels/M5TreeModel.cs

     
    3131
    3232namespace HeuristicLab.Algorithms.DataAnalysis {
    3333  [StorableClass]
    34   internal class M5TreeModel : RegressionModel, IM5Model {
     34  public class M5TreeModel : RegressionModel, IM5Model {
    3535    public const string NumCurrentLeafsResultName = "Number of current leafs";
    3636    #region Properties
    3737    [Storable]
     
    6969
    7070    #region IM5Model
    7171    public void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) {
     72      //create initial (overfitted) tree
    7273      Root = M5NodeModel.CreateNode(m5Params.TargetVariable, m5Params);
    7374      Root.Split(trainingRows, m5Params);
    7475
    75       InitializeLeafCounter(m5Params);
     76      //initialize leaf counter
     77      var leafs = Root.EnumerateNodes().Count(x => x.IsLeaf);
     78      if (!m5Params.Results.ContainsKey(NumCurrentLeafsResultName))
     79        m5Params.Results.Add(new Result(NumCurrentLeafsResultName, new IntValue(leafs)));
     80      else ((IntValue)m5Params.Results[NumCurrentLeafsResultName].Value).Value = leafs;
    7681
    77       var buPruner = m5Params.Pruning as BottomUpPruningBase;
    78       if (buPruner != null) buPruner.Prune(this, trainingRows, pruningRows, m5Params, cancellationToken);
     82      //prune
     83      m5Params.Pruning.Prune(this, trainingRows, pruningRows, m5Params, cancellationToken);
    7984
     85      //build final leaf models
    8086      Root.BuildLeafModels(trainingRows.Union(pruningRows).ToArray(), m5Params, cancellationToken);
    8187    }
    8288
     
    8591    }
    8692    #endregion
    8793
    88     #region Helpers
    89     private void InitializeLeafCounter(M5Parameters m5Params) {
    90       if (!m5Params.Results.ContainsKey(NumCurrentLeafsResultName))
    91         m5Params.Results.Add(new Result(NumCurrentLeafsResultName, new IntValue(Root.EnumerateNodes().Count(x => x.IsLeaf))));
    92       else ((IntValue)m5Params.Results[NumCurrentLeafsResultName].Value).Value = Root.EnumerateNodes().Count(x => x.IsLeaf);
    93     }
    94     #endregion
    95 
    9694    [StorableClass]
    9795    private class ConfidenceM5TreeModel : M5TreeModel, IConfidenceRegressionModel {
    9896      #region HLConstructors & Cloning
  • Pruning/BottomUpPruningBase.cs

     
    1919 */
    2020#endregion
    2121
     22using System;
    2223using System.Collections.Generic;
    2324using System.Linq;
    2425using System.Threading;
     
    4647    protected BottomUpPruningBase(bool deserializing) : base(deserializing) { }
    4748    protected BottomUpPruningBase(BottomUpPruningBase original, Cloner cloner) : base(original, cloner) { }
    4849    protected BottomUpPruningBase() {
    49       Parameters.Add(new FixedValueParameter<DoubleValue>(PruningStrengthParameterName, "The strength of the pruning. Higher values force the algorithm to create simpler models", new DoubleValue(4.0)));
     50      Parameters.Add(new FixedValueParameter<DoubleValue>(PruningStrengthParameterName, "The strength of the pruning. Higher values force the algorithm to create simpler models", new DoubleValue(2.0)));
    5051    }
    5152    #endregion
    5253
     
    5859    }
    5960    #endregion
    6061
    61     internal void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) {
    62       var globalStdDev = m5Params.Data.GetDoubleValues(m5Params.TargetVariable, trainingRows).StandardDeviationPop();
    63 
    64       Prune(treeModel.Root, trainingRows, pruningRows, m5Params, new Dictionary<M5NodeModel, int>(), new Dictionary<M5NodeModel, int>(), cancellationToken, globalStdDev);
     62    public void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) {
     63      Prune(treeModel.Root, trainingRows, pruningRows, m5Params, new Dictionary<M5NodeModel, int>(), new Dictionary<M5NodeModel, int>(), cancellationToken);
    6564    }
    6665
    6766    private bool Prune(M5NodeModel node, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params,
    6867      Dictionary<M5NodeModel, int> modelComplexities, Dictionary<M5NodeModel, int> nodeComplexities,
    69       CancellationToken cancellationToken, double globalStdDev) {
     68      CancellationToken cancellationToken) {
    7069      //build pruning model
    7170      int numModelParams;
    7271      var pruningModel = M5StaticUtilities.BuildModel(trainingRows, m5Params, PruningLeafModel(m5Params.LeafModel), cancellationToken, out numModelParams);
     
    7978      }
    8079
    8180      //split training & pruning data
    82       IReadOnlyList<int> leftTest, rightTest;
    83       M5StaticUtilities.SplitRows(pruningRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out leftTest, out rightTest);
     81      IReadOnlyList<int> leftPruning, rightPruning;
     82      M5StaticUtilities.SplitRows(pruningRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out leftPruning, out rightPruning);
    8483      IReadOnlyList<int> leftTraining, rightTraining;
    8584      M5StaticUtilities.SplitRows(trainingRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out leftTraining, out rightTraining);
    8685
    8786      //prune children frist
    88       var lpruned = Prune(node.Left, leftTraining, leftTest, m5Params, modelComplexities, nodeComplexities, cancellationToken, globalStdDev);
    89       var rpruned = Prune(node.Right, rightTraining, rightTest, m5Params, modelComplexities, nodeComplexities, cancellationToken, globalStdDev);
     87      var lpruned = Prune(node.Left, leftTraining, leftPruning, m5Params, modelComplexities, nodeComplexities, cancellationToken);
     88      var rpruned = Prune(node.Right, rightTraining, rightPruning, m5Params, modelComplexities, nodeComplexities, cancellationToken);
    9089      nodeComplexities.Add(node, nodeComplexities[node.Left] + nodeComplexities[node.Right] + 1);
    9190
    9291      //TODO check if this reduces quality. It reduces training effort (consideraby for some pruningTypes)
     
    9392      if (!lpruned && !rpruned) return false;
    9493
    9594      //check if pruning will happen on this node
    96       if (!DecidePruneNode(node, m5Params, pruningRows, modelComplexities, nodeComplexities, globalStdDev)) return false;
     95      if (!DecidePruneNode(node, m5Params, pruningRows, modelComplexities, nodeComplexities)) return false;
    9796
    9897      //convert to leafNode
    9998      ((IntValue)m5Params.Results[M5TreeModel.NumCurrentLeafsResultName].Value).Value -= node.EnumerateNodes().Count(x => x.IsLeaf) - 1;
    10099
    101       //TODO chack wether removal is beneficial
    102100      nodeComplexities.Remove(node.Left);
    103101      nodeComplexities.Remove(node.Right);
    104102      modelComplexities.Remove(node.Left);
     
    110108    }
    111109
    112110    private bool DecidePruneNode(M5NodeModel node, M5Parameters m5Params, IReadOnlyCollection<int> testRows,
    113       IReadOnlyDictionary<M5NodeModel, int> modelComplexities, IReadOnlyDictionary<M5NodeModel, int> nodeComplexities,
    114       double globalStdDev) {
     111      IReadOnlyDictionary<M5NodeModel, int> modelComplexities, IReadOnlyDictionary<M5NodeModel, int> nodeComplexities) {
    115112      if (testRows.Count == 0) return true;
    116113
    117114      //create regressionProblemdata from pruning data
     
    123120
    124121      //evaluate combined sub nodes and pruning model
    125122      var rmsModel = node.Model.CreateRegressionSolution(pd).TestRootMeanSquaredError;
     123      rmsModel = rmsModel.IsAlmost(0.0) ? 0 : rmsModel;
    126124      var rmsSubTree = node.CreateRegressionSolution(pd).TestRootMeanSquaredError;
     125      rmsSubTree = rmsSubTree.IsAlmost(0.0) ? 0 : rmsSubTree;
    127126
     127      int rows = pd.Dataset.Rows;
     128      node.PruningStrength = rows * (rmsModel * (rows - nodeComplexities[node]) + rmsSubTree * (modelComplexities[node] - rows));
     129      node.PruningStrength /= rmsModel * modelComplexities[node] * (nodeComplexities[node] - rows) + rmsSubTree * nodeComplexities[node] * (rows - modelComplexities[node]);
     130
     131      var pf1 = PruningFactor(pd.Dataset.Rows, modelComplexities[node]);
     132      var pf2 = PruningFactor(pd.Dataset.Rows, nodeComplexities[node]);
    128133      //weigh, compare and decide
    129       var adjustedRmsModel = rmsModel * PruningFactor(pd.Dataset.Rows, modelComplexities[node]);
    130       var adjustedRmsTree = rmsSubTree * PruningFactor(pd.Dataset.Rows, nodeComplexities[node.Left] + nodeComplexities[node.Right] + 1);
    131       return adjustedRmsModel <= adjustedRmsTree;
     134      var adjustedRmsModel = rmsModel * pf1;
     135      var adjustedRmsTree = rmsSubTree * pf2;
     136
     137      return adjustedRmsModel < adjustedRmsTree;
    132138    }
    133139
    /// <summary>
    /// Calculates the factor an error estimate is multiplied with in order to penalize a model
    /// for the number of parameters it uses.
    /// </summary>
    /// <param name="noInstances">The number of instances the error was estimated on.</param>
    /// <param name="noParams">The number of parameters of the model.</param>
    /// <returns>10 if there are not more instances than parameters,
    /// otherwise (noInstances + PruningStrength * noParams) / (noInstances - noParams).</returns>
    private double PruningFactor(int noInstances, int noParams) {
      //in the original M5 tree a cut off is used:
      //NOTE(review): the author remarked that "noInstances + PruningStrength * noParams" would be preferable here
      //to keep some punishment for additional parameters; that statement was placed behind this return and could
      //never be reached, so it has been removed without changing the result
      if (noInstances <= noParams) return 10;

      //noInstances > noParams holds here, so the denominator is positive
      return (noInstances + PruningStrength * noParams) / (noInstances - noParams);
    }
    137148  }
    138149}
     150 No newline at end of file
  • Pruning/NoPruning.cs

     
    2020#endregion
    2121
    2222using System.Collections.Generic;
     23using System.Threading;
    2324using HeuristicLab.Common;
    2425using HeuristicLab.Core;
    2526using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     
    4041    public int MinLeafSize(IRegressionProblemData pd, ILeafModel leafModel) {
    4142      return 0;
    4243    }
     44
    /// <summary>
    /// Intentionally empty: this pruning strategy leaves <paramref name="treeModel"/> untouched.
    /// </summary>
    public void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) {
      //nothing to prune
    }
    4346    #endregion
    4447  }
    4548}
     49 No newline at end of file
  • Spliting/CorrelationImpuritiyCalculator.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26
     27namespace HeuristicLab.Algorithms.DataAnalysis {
     28  /// <summary>
     29  /// Helper class for incremental split calculation.
     30  /// Used while moving a potential Spliter along the ordered training Instances
     31  /// </summary>
     32  internal class CorrelationImpuritiyCalculator {
     33    #region state
     34    //Data
     35    private readonly List<double> attributeValues;
     36    private readonly List<double> targetValues;
     37    private readonly double order;
     38    private readonly UnivariateOnlineLR left;
     39    private readonly UnivariateOnlineLR right;
     40    #endregion
     41
     42    #region Properties
     43    public double Impurity { get; private set; }
     44    public double SplitValue {
     45      get {
     46        if (left.Size <= 0) return double.NegativeInfinity;
     47        if (left.Size >= attributeValues.Count) return double.PositiveInfinity;
     48        return (attributeValues[left.Size - 1] + attributeValues[left.Size]) / 2;
     49      }
     50    }
     51    public bool ValidPosition {
     52      get { return !attributeValues[left.Size - 1].IsAlmost(attributeValues[left.Size]); }
     53    }
     54    public int LeftSize {
     55      get { return left.Size; }
     56    }
     57    #endregion
     58
     59    #region Constructors
     60    public CorrelationImpuritiyCalculator(int partition, IEnumerable<double> atts, IEnumerable<double> targets, double order) {
     61      if (order <= 0) throw new ArgumentException("Splitter order must be larger than 0");
     62      this.order = order;
     63      attributeValues = atts.ToList();
     64      targetValues = targets.ToList();
     65      left = new UnivariateOnlineLR(attributeValues.Take(partition).ToList(), targetValues.Take(partition).ToList());
     66      right = new UnivariateOnlineLR(attributeValues.Skip(partition).ToList(), targetValues.Skip(partition).ToList());
     67      UpdateImpurity();
     68    }
     69    #endregion
     70
     71    #region IImpurityCalculator
     72    public void Increment() {
     73      var target = targetValues[left.Size];
     74      var att = attributeValues[left.Size];
     75      left.Add(att, target);
     76      right.Remove(att, target);
     77      UpdateImpurity();
     78    }
     79    #endregion
     80
     81    private void UpdateImpurity() {
     82      var yl = Math.Pow(left.Ssr, 1.0 / order);
     83      var yr = Math.Pow(right.Ssr, 1.0 / order);
     84      if (left.Size > 1 && right.Size > 1) Impurity = -yl - yr;
     85      else Impurity = double.MinValue;
     86    }
     87  }
     88}
     89 No newline at end of file
  • Spliting/CorrelationSplitter.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26using HeuristicLab.Core;
     27using HeuristicLab.Data;
     28using HeuristicLab.Parameters;
     29using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     30using HeuristicLab.Problems.DataAnalysis;
     31
     32namespace HeuristicLab.Algorithms.DataAnalysis {
     33  [StorableClass]
     34  [Item("CorrelationSplitter", "An experimental split selector that uses correlation coefficients")]
     35  public class CorrelationSplitter : ParameterizedNamedItem, ISplitter {
     36    public const string OrderParameterName = "Order";
     37    public IFixedValueParameter<DoubleValue> OrderParameter {
     38      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
     39    }
     40    public double Order {
     41      get { return OrderParameter.Value.Value; }
     42    }
     43
     44    #region Constructors & Cloning
     45    [StorableConstructor]
     46    private CorrelationSplitter(bool deserializing) { }
     47    private CorrelationSplitter(CorrelationSplitter original, Cloner cloner) : base(original, cloner) { }
     48    public CorrelationSplitter() {
     49      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation ssrLeft^(1/Order)+ssrRight^(1/Order).", new DoubleValue(1)));
     50    }
     51    public override IDeepCloneable Clone(Cloner cloner) {
     52      return new CorrelationSplitter(this, cloner);
     53    }
     54    #endregion
     55
     56    #region ISplitType
     57    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
     58      var bestSize = 0;
     59      var bestImpurity = double.MinValue;
     60      var bestSplitValue = 0.0;
     61      var bestSplitAttr = string.Empty;
     62      splitAttr = bestSplitAttr;
     63      splitValue = bestSplitValue;
     64      if (splitData.Dataset.Rows < minLeafSize * 2) return false;
     65
     66      //find best Attribute for the Splitter
     67      foreach (var attr in splitData.AllowedInputVariables) {
     68        int size;
     69        double impurity, sValue;
     70        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(splitData.Dataset.GetDoubleValues(splitData.TargetVariable), Tuple.Create).OrderBy(x => x.Item1).ToArray();
     71        AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out size, out impurity, out sValue);
     72        if (!(bestImpurity < impurity)) continue;
     73        bestImpurity = impurity;
     74        bestSize = size;
     75        bestSplitValue = sValue;
     76        bestSplitAttr = attr;
     77      }
     78
     79      splitAttr = bestSplitAttr;
     80      splitValue = bestSplitValue;
     81
     82      //if no suitable split exists => leafNode
     83      return bestSize >= minLeafSize && bestSize <= splitData.Dataset.Rows - minLeafSize;
     84    }
     85
     86    private void AttributeSplit(IReadOnlyList<double> attValues, IEnumerable<double> targetValues, int minLeafSize, out int leftSize, out double maxImpurity, out double splitValue) {
     87      leftSize = -1;
     88      splitValue = double.MinValue;
     89      maxImpurity = double.NegativeInfinity;
     90      var splitValues = new List<double>();
     91      var splitSizes = new List<int>();
     92      var length = attValues.Count;
     93
     94      var start = minLeafSize;
     95      while (attValues[start - 1].IsAlmost(attValues[start]) && start < length)
     96        start++;
     97      if (start >= length) return;
     98
     99      var imp = new CorrelationImpuritiyCalculator(minLeafSize, attValues, targetValues, Order);
     100      maxImpurity = imp.Impurity;
     101      splitValues.Add(imp.SplitValue);
     102      splitSizes.Add(imp.LeftSize);
     103
     104      while (imp.LeftSize < length - minLeafSize) {
     105        imp.Increment();
     106        if (!imp.ValidPosition) continue; //splits can not be made between to equal points
     107
     108        if (imp.Impurity.IsAlmost(maxImpurity)) {
     109          splitValues.Add(imp.SplitValue);
     110          splitSizes.Add(imp.LeftSize);
     111          continue;
     112        }
     113
     114        if (imp.Impurity < maxImpurity) continue;
     115        splitValues.Clear();
     116        splitSizes.Clear();
     117        maxImpurity = imp.Impurity;
     118        splitValues.Add(imp.SplitValue);
     119        splitSizes.Add(imp.LeftSize);
     120      }
     121
     122      var j = splitSizes.Count / 2;
     123      if (splitSizes.Count == 0) return;
     124      splitValue = splitValues[j];
     125      leftSize = splitSizes[j];
     126    }
     127    #endregion
     128  }
     129}
     130 No newline at end of file
  • Spliting/M5Splitter.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26using HeuristicLab.Core;
     27using HeuristicLab.Data;
     28using HeuristicLab.Parameters;
     29using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     30using HeuristicLab.Problems.DataAnalysis;
     31
     32namespace HeuristicLab.Algorithms.DataAnalysis {
     33  [StorableClass]
     34  [Item("M5Splitter", "A split selector that uses the ratio between Variances^(1/Order) to determine good splits")]
     35  public class M5Splitter : ParameterizedNamedItem, ISplitter {
     36    public const string OrderParameterName = "Order";
     37    public IFixedValueParameter<DoubleValue> OrderParameter {
     38      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
     39    }
     40    public double Order {
     41      get { return OrderParameter.Value.Value; }
     42    }
     43
     44    #region Constructors & Cloning
     45    [StorableConstructor]
     46    private M5Splitter(bool deserializing) { }
     47    private M5Splitter(M5Splitter original, Cloner cloner) : base(original, cloner) { }
     48    public M5Splitter() {
     49      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation sum (x_i - x_avg)^Order.", new DoubleValue(5)));
     50    }
     51    public override IDeepCloneable Clone(Cloner cloner) {
     52      return new M5Splitter(this, cloner);
     53    }
     54    #endregion
     55
     56    #region ISplitType
     57    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
     58      var bestPos = 0;
     59      var bestImpurity = double.MinValue;
     60      var bestSplitValue = 0.0;
     61      var bestSplitAttr = string.Empty;
     62      splitAttr = bestSplitAttr;
     63      splitValue = bestSplitValue;
     64      if (splitData.Dataset.Rows < minLeafSize) return false;
     65      //find best Attribute for the Splitter
     66      foreach (var attr in splitData.AllowedInputVariables) {
     67        int pos;
     68        double impurity, sValue;
     69        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(splitData.Dataset.GetDoubleValues(splitData.TargetVariable), Tuple.Create).OrderBy(x => x.Item1).ToArray();
     70        AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out pos, out impurity, out sValue);
     71        if (!(bestImpurity < impurity)) continue;
     72        bestImpurity = impurity;
     73        bestPos = pos;
     74        bestSplitValue = sValue;
     75        bestSplitAttr = attr;
     76      }
     77
     78      splitAttr = bestSplitAttr;
     79      splitValue = bestSplitValue;
     80      //if no suitable split exists => leafNode
     81      return bestPos + 1 >= minLeafSize && bestPos <= splitData.Dataset.Rows - minLeafSize;
     82    }
     83
     84    private void AttributeSplit(IReadOnlyList<double> attValues, IReadOnlyList<double> targetValues, int minLeafSize, out int position, out double maxImpurity, out double splitValue) {
     85      position = 0;
     86      maxImpurity = -1E20;
     87      splitValue = 0.0;
     88      var length = targetValues.Count;
     89
     90
     91      // weka code
     92      var low = 0;
     93      var high = length - 1;
     94      if (high - low + 1 < 4) return;
     95      var len = Math.Max(minLeafSize - 1, high - low + 1 < 5 ? 1 : (high - low + 1) / 5);
     96      position = low;
     97      var part = low + len - 1;
     98      var imp = new OrderImpurityCalculator(part + 1, targetValues, Order);
     99
     100
     101      //if (imp.Impurity > maxImpurity && !attValues[part - 1].IsAlmost(attValues[part])) {
     102      //  maxImpurity = imp.Impurity;
     103      //  splitValue = (attValues[part - 1] + attValues[part]) / 2;
     104      //  position = part;
     105      //}
     106
     107      for (var i = low + len; i < high - len; i++) {
     108        imp.Increment(targetValues[i], OrderImpurityCalculator.IncrementType.Left);
     109        if (attValues[i].IsAlmost(attValues[i + 1])) continue; //splits can not be made between to equal points
     110        if (imp.Impurity < maxImpurity) continue;
     111        maxImpurity = imp.Impurity;
     112        splitValue = (attValues[i] + attValues[i + 1]) / 2;
     113        position = i;
     114      }
     115    }
     116    #endregion
     117  }
     118}
     119 No newline at end of file
  • Spliting/NeumaierSum.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using System.Runtime.CompilerServices;
     26using HeuristicLab.Common;
     27
     28namespace HeuristicLab.Algorithms.DataAnalysis {
     29  /// <summary>
     30  /// Helper class for incremental split calculation.
     31  /// Used while moving a potential Splitter along the ordered training Instances
     32  /// </summary>
     33  internal class NeumaierSum {
     34    #region state
     35    private double sum;
     36    private double correction;
     37    #endregion
     38
     39    #region Constructors
     40    public NeumaierSum(double startvalue) {
     41      sum = startvalue;
     42      correction = 0;
     43    }
     44    #endregion
     45
     46    [MethodImpl(MethodImplOptions.NoOptimization)]
     47    public void Add(double value) {
     48      var t = sum + value;
     49      var absSum = sum > 0 ? sum : -sum;
     50      var absv = value > 0 ? value : -value;
     51      if (absSum >= absv)
     52        correction += (sum - t) + value;
     53      else
     54        correction += (value - t) + sum;
     55      sum = t;
     56    }
     57
     58    public double Get() {
     59      return sum + correction;
     60    }
     61
     62    public void Mul(double value) {
     63      sum *= value;
     64      correction *= value;
     65    }
     66  }
     67}
     68 No newline at end of file
  • Spliting/OptimumSearchingSplitter.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26using HeuristicLab.Core;
     27using HeuristicLab.Data;
     28using HeuristicLab.Parameters;
     29using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
     30using HeuristicLab.Problems.DataAnalysis;
     31
     32namespace HeuristicLab.Algorithms.DataAnalysis {
     33  [StorableClass]
     34  [Item("OptimumSearchingSplitter", "A split selector that favours higher resolution splits near percieved optima.\n Decribed in \"Model-Based Genetic Algorithms for Algorithm Configuration\" by Carlos Ansotegui et al ")]
     35  public class OptimumSearchingSplitter : ParameterizedNamedItem, ISplitter {
     36    public const string SearchStrengthParameterName = "Search Strength";
     37    public const string MaximizationParamterName = "Maximization";
     38    public const string OrderParameterName = "Order";
     39    public IFixedValueParameter<DoubleValue> OrderParameter {
     40      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
     41    }
     42    public IFixedValueParameter<PercentValue> SearchStrengthParameter {
     43      get { return (IFixedValueParameter<PercentValue>)Parameters[SearchStrengthParameterName]; }
     44    }
     45    public IFixedValueParameter<BoolValue> MaximizationParameter {
     46      get { return (IFixedValueParameter<BoolValue>)Parameters[MaximizationParamterName]; }
     47    }
     48    public double Order {
     49      get { return OrderParameter.Value.Value; }
     50    }
     51    public double SearchStrength {
     52      get { return SearchStrengthParameter.Value.Value; }
     53    }
     54    public bool Maximization {
     55      get { return MaximizationParameter.Value.Value; }
     56    }
     57
     58    #region Constructors & Cloning
     59    [StorableConstructor]
     60    private OptimumSearchingSplitter(bool deserializing) { }
     61    private OptimumSearchingSplitter(OptimumSearchingSplitter original, Cloner cloner) : base(original, cloner) { }
     62    public OptimumSearchingSplitter() {
     63      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation sum (x_i - x_avg)^Order.", new DoubleValue(2)));
     64      Parameters.Add(new FixedValueParameter<PercentValue>(SearchStrengthParameterName, "How strong the spliting process should be skewed towards/away from the percieved optimum", new PercentValue(0.10)));
     65      Parameters.Add(new FixedValueParameter<BoolValue>(MaximizationParamterName, "Whether the splitting procedure should asume a minimization or maximization procedure."));
     66    }
     67    public override IDeepCloneable Clone(Cloner cloner) {
     68      return new OptimumSearchingSplitter(this, cloner);
     69    }
     70    #endregion
     71
     72    #region ISplitType
     73    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
     74      var bestImpurity = double.MinValue;
     75      var bestSplitValue = 0.0;
     76      var bestSplitAttr = string.Empty;
     77      var bestSize = 0;
     78      splitAttr = bestSplitAttr;
     79      splitValue = bestSplitValue;
     80
     81      var targets = splitData.TargetVariableValues.ToArray();
     82      var vh = targets.Quantile(Maximization ? 1 - SearchStrength : SearchStrength);
     83      var lower = new HashSet<int>(targets.Select((x, i) => new {x, i}).Where(e => e.x < vh).Select(e => e.i));
     84
     85      if (splitData.Dataset.Rows < minLeafSize) return false;
     86
     87      foreach (var attr in splitData.AllowedInputVariables) {
     88        int pos;
     89        double impurity, sValue;
     90        AttributeSplit(splitData, lower, attr, vh, minLeafSize, out pos, out impurity, out sValue);
     91        if (!(bestImpurity < impurity)) continue;
     92        bestImpurity = impurity;
     93        bestSplitValue = sValue;
     94        bestSplitAttr = attr;
     95        bestSize = pos;
     96      }
     97
     98      splitAttr = bestSplitAttr;
     99      splitValue = bestSplitValue;
     100
     101      //if no suitable split exists => leafNode
     102      return bestSize >= minLeafSize && bestSize <= splitData.Dataset.Rows - minLeafSize;
     103    }
     104
     105    private void AttributeSplit(IRegressionProblemData splitData, ICollection<int> t, string attribute, double vh, int minLeafSize, out int leftSize, out double maxImpurity, out double splitValue) {
     106      leftSize = 0;
     107      maxImpurity = -1E20;
     108      splitValue = 0.0;
     109      var length = splitData.Dataset.Rows;
     110
     111      double lls = 0, rls = 0, lts = 0, rts = 0;
     112      int ltn = 0, rtn = t.Count;
     113
     114      var points = splitData.Dataset.GetDoubleValues(attribute).Select((x, i) => new {x, i}).OrderBy(e => e.x).ToArray();
     115      for (var i = 0; i < length - minLeafSize; i++) {
     116        var point = points[i];
     117        var con = Contibution(splitData, vh, point.i);
     118        //move contribution to and from respcetive sums
     119        if (t.Contains(i)) {
     120          lls += con;
     121          rls -= con;
     122        }
     123        else {
     124          ltn++;
     125          rtn--;
     126          lts += con;
     127          rts -= con;
     128        }
     129
     130        //splits can not be made between to equal points
     131        if (point.x.IsAlmost(points[i + 1].x)) continue;
     132
     133        //calculate impurity / score
     134        var al = (ltn + lts) / (1 + lls);
     135        var ar = (rtn + rts) / (1 + rls);
     136        var impurity = ltn > rtn ? al : ltn < rtn ? ar : Math.Min(al, ar);
     137
     138        if (i < minLeafSize || impurity < maxImpurity) continue;
     139        maxImpurity = impurity;
     140        splitValue = (point.x + points[i + 1].x) / 2;
     141        leftSize = i + 1;
     142      }
     143    }
     144
     145    private double Contibution(IRegressionProblemData splitData, double vh, int i) {
     146      var v = splitData.Dataset.GetDoubleValue(splitData.TargetVariable, i) - vh;
     147      return Math.Pow(v, Order);
     148    }
     149    #endregion
     150  }
     151}
     152 No newline at end of file
  • Spliting/OrderImpurityCalculator.cs

     
    2727namespace HeuristicLab.Algorithms.DataAnalysis {
    2828  /// <summary>
    2929  /// Helper class for incremental split calculation.
    30   /// Used while moving a potential Spliter along the ordered training Instances
     30  /// Used while moving a potential Splitter along the ordered training Instances
    3131  /// </summary>
    3232  internal class OrderImpurityCalculator {
    3333    internal enum IncrementType {
     
    104104      VarLeft = NoLeft <= 0 ? 0 : Math.Abs(NoLeft * SqSumLeft - SumLeft * SumLeft) / (NoLeft * NoLeft);
    105105      VarRight = NoRight <= 0 ? 0 : Math.Abs(NoRight * SqSumRight - SumRight * SumRight) / (NoRight * NoRight);
    106106
    107       if (Order <= 0) throw new ArgumentException("Spliter order must be larger than 0");
     107      if (Order <= 0) throw new ArgumentException("Splitter order must be larger than 0");
    108108      if (Order.IsAlmost(1)) {
    109109        y = VarTotal;
    110110        yl = VarLeft;
     
    115115        yl = Math.Pow(VarLeft, 1.0 / Order);
    116116        yr = Math.Pow(VarRight, 1.0 / Order);
    117117      }
    118       var t = NoRight + NoLeft;
    119       if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Spliter = 0;
    120       else Impurity = y - NoLeft / t * yl - NoRight / t * yr; //  Spliter = y - NoLeft / NoRight * yl - NoRight / NoLeft * yr
     118      if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Splitter = 0;
     119      else Impurity = y - (NoLeft * yl + NoRight * yr) / (NoRight + NoLeft);
    121120    }
    122121    #endregion
    123122  }
  • Spliting/UnivariateOnlineLR.cs

     
     1#region License Information
     2/* HeuristicLab
     3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     4 *
     5 * This file is part of HeuristicLab.
     6 *
     7 * HeuristicLab is free software: you can redistribute it and/or modify
     8 * it under the terms of the GNU General Public License as published by
     9 * the Free Software Foundation, either version 3 of the License, or
     10 * (at your option) any later version.
     11 *
     12 * HeuristicLab is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     15 * GNU General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU General Public License
     18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
     19 */
     20#endregion
     21
     22using System;
     23using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Common;
     26
     27namespace HeuristicLab.Algorithms.DataAnalysis {
     28  /// <summary>
     29  /// Helper class for incremental split calculation.
     30  /// Used while moving a potential Spliter along the ordered training Instances
     31  /// </summary>
     32  internal class UnivariateOnlineLR {
     33    #region state
     34    private readonly NeumaierSum targetMean;
     35    private readonly NeumaierSum attributeMean;
     36    private readonly NeumaierSum targetVarSum;
     37    private readonly NeumaierSum attributeVarSum;
     38    private readonly NeumaierSum comoment;
     39    private readonly NeumaierSum ssr;
     40    private int size;
     41    #endregion
     42
     43    public double Ssr {
     44      get { return ssr.Get(); }
     45    }
     46    public int Size {
     47      get { return size; }
     48    }
     49
     50    private double Beta {
     51      get { return comoment.Get() / attributeVarSum.Get(); }
     52    }
     53
     54    private double Alpha {
     55      get { return targetMean.Get() - Beta * attributeMean.Get(); }
     56    }
     57
     58    public UnivariateOnlineLR(ICollection<double> attributeValues, ICollection<double> targetValues) {
     59      if (attributeValues.Count != targetValues.Count) throw new ArgumentException("Targets and Attributes need to have the same length");
     60      size = attributeValues.Count;
     61
     62      var yMean = targetValues.Average();
     63      var xMean = attributeValues.Average();
     64      targetMean = new NeumaierSum(yMean);
     65      attributeMean = new NeumaierSum(xMean);
     66      targetVarSum = new NeumaierSum(targetValues.VariancePop() * size);
     67      attributeVarSum = new NeumaierSum(attributeValues.VariancePop() * size);
     68      comoment = new NeumaierSum(attributeValues.Zip(targetValues, (x, y) => (x - xMean) * (y - yMean)).Sum());
     69
     70      var beta = comoment.Get() / attributeVarSum.Get();
     71      var alpha = yMean - beta * xMean;
     72      ssr = new NeumaierSum(attributeValues.Zip(targetValues, (x, y) => y - alpha - beta * x).Sum(x => x * x));
     73    }
     74
     75    public void Add(double attributeValue, double targetValue) {
     76      var predictOld = Predict(attributeValue, targetValue);
     77
     78      size++;
     79      var dx = attributeValue - attributeMean.Get();
     80      var dy = targetValue - targetMean.Get();
     81      attributeMean.Add(dx / size);
     82      targetMean.Add(dy / size);
     83      var dx2 = attributeValue - attributeMean.Get();
     84      var dy2 = targetValue - targetMean.Get();
     85      attributeVarSum.Add(dx * dx2);
     86      targetVarSum.Add(dy * dy2);
     87      comoment.Add(dx * dy2);
     88
     89      ssr.Add(predictOld * Predict(attributeValue, targetValue));
     90    }
     91
     92    public void Remove(double attributeValue, double targetValue) {
     93      var predictOld = Predict(attributeValue, targetValue);
     94
     95      var dx2 = attributeValue - attributeMean.Get();
     96      var dy2 = targetValue - targetMean.Get();
     97      attributeMean.Mul(size / (size - 1.0));
     98      targetMean.Mul(size / (size - 1.0));
     99      attributeMean.Add(-attributeValue / (size - 1.0));
     100      targetMean.Add(-targetValue / (size - 1.0));
     101      var dx = attributeValue - attributeMean.Get();
     102      var dy = targetValue - targetMean.Get();
     103      attributeVarSum.Add(-dx * dx2);
     104      targetVarSum.Add(-dy * dy2);
     105      comoment.Add(-dx * dy2);
     106      size--;
     107
     108      ssr.Add(-predictOld * Predict(attributeValue, targetValue));
     109    }
     110
     111    private double Predict(double attributeValue, double targetValue) {
     112      return targetValue - Alpha - Beta * attributeValue;
     113    }
     114  }
     115}
     116 No newline at end of file
Note: See TracBrowser for help on using the repository browser.