1  #region License Information


2  /* HeuristicLab


3  * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)


4  * and the BEACON Center for the Study of Evolution in Action.


5  *


6  * This file is part of HeuristicLab.


7  *


8  * HeuristicLab is free software: you can redistribute it and/or modify


9  * it under the terms of the GNU General Public License as published by


10  * the Free Software Foundation, either version 3 of the License, or


11  * (at your option) any later version.


12  *


13  * HeuristicLab is distributed in the hope that it will be useful,


14  * but WITHOUT ANY WARRANTY; without even the implied warranty of


15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


16  * GNU General Public License for more details.


17  *


18  * You should have received a copy of the GNU General Public License


19  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


20  */


21  #endregion


22 


23  using System;


24  using System.Collections.Generic;


25  using System.Diagnostics.Contracts;


26  using System.Linq;


27  using GradientBoostedTrees;


28  using HeuristicLab.Problems.DataAnalysis;


29  using HeuristicLab.Random;


30 


namespace HeuristicLab.Algorithms.DataAnalysis {
  public static class GradientBoostedTreesAlgorithmStatic {
    #region static API

    // Handle for stepwise gradient boosting: create via CreateGbmState(), advance
    // one boosting iteration at a time with MakeStep(), and query progress through
    // the members below. Implementations are created only by the factory method.
    public interface IGbmState {
      // The ensemble model built so far (constant model plus all trees added by MakeStep).
      IRegressionModel GetModel();
      // Average loss over the training rows for the current ensemble predictions.
      double GetTrainLoss();
      // Average loss over the test rows for the current ensemble predictions.
      double GetTestLoss();
      // Per-variable relevance as reported by the tree builder, keyed by variable name.
      IEnumerable<KeyValuePair<string, double>> GetVariableRelevance();
    }


42 


43  // created through factory method


44  private class GbmState : IGbmState {


45  internal IRegressionProblemData problemData { get; set; }


46  internal MersenneTwister random { get; set; }


47  internal ILossFunction lossFunction { get; set; }


48  internal int maxDepth { get; set; }


49  internal double nu { get; set; }


50  internal double r { get; set; }


51  internal double m { get; set; }


52  internal RegressionTreeBuilder treeBuilder;


53 


54 


55  // array members (allocate only once)


56  internal double[] pred;


57  internal double[] predTest;


58  internal double[] w;


59  internal double[] y;


60  internal int[] activeIdx;


61  internal double[] rim;


62 


63  internal IList<IRegressionModel> models;


64  internal IList<double> weights;


65 


66  public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth, double r, double m, double nu) {


67  // default settings for MaxDepth, Nu and R


68  this.maxDepth = maxDepth;


69  this.nu = nu;


70  this.r = r;


71  this.m = m;


72 


73  random = new MersenneTwister(randSeed);


74  this.problemData = problemData;


75  this.lossFunction = lossFunction;


76 


77  int nRows = problemData.TrainingIndices.Count();


78 


79  y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();


80  // weights are all 1 for now (HL doesn't support weights yet)


81  w = Enumerable.Repeat(1.0, nRows).ToArray();


82 


83  treeBuilder = new RegressionTreeBuilder(problemData, random);


84 


85  activeIdx = Enumerable.Range(0, nRows).ToArray();


86 


87  // prepare arrays (allocate only once)


88  double f0 = y.Average(); // default prediction (constant)


89  pred = Enumerable.Repeat(f0, nRows).ToArray();


90  predTest = Enumerable.Repeat(f0, problemData.TestIndices.Count()).ToArray();


91  rim = new double[nRows];


92 


93  models = new List<IRegressionModel>();


94  weights = new List<double>();


95  // add constant model


96  models.Add(new ConstantRegressionModel(f0));


97  weights.Add(1.0);


98  }


99 


100  public IRegressionModel GetModel() {


101  return new GradientBoostedTreesModel(models, weights);


102  }


103  public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {


104  return treeBuilder.GetVariableRelevance();


105  }


106 


107  public double GetTrainLoss() {


108  int nRows = y.Length;


109  return lossFunction.GetLoss(y, pred, w) / nRows;


110  }


111  public double GetTestLoss() {


112  var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices);


113  var wTest = yTest.Select(_ => 1.0); // ones


114  var nRows = yTest.Count();


115  return lossFunction.GetLoss(yTest, predTest, wTest) / nRows;


116  }


117  }


118 


119  // simple interface


120  public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, int maxDepth, double nu, double r, int maxIterations) {


121  return TrainGbm(problemData, new SquaredErrorLoss(), maxDepth, nu, r, maxIterations);


122  }


123 


124  // simple interface


125  public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction,


126  int maxDepth, double nu, double r, int maxIterations, uint randSeed = 31415) {


127  Contract.Assert(r > 0);


128  Contract.Assert(r <= 1.0);


129  Contract.Assert(nu > 0);


130  Contract.Assert(nu <= 1.0);


131 


132  var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed);


133  state.maxDepth = maxDepth;


134  state.r = r;


135  state.nu = nu;


136 


137  for (int iter = 0; iter < maxIterations; iter++) {


138  MakeStep(state);


139  }


140 


141  var model = new GradientBoostedTreesModel(state.models, state.weights);


142  return new RegressionSolution(model, (IRegressionProblemData)problemData.Clone());


143  }


144 


145  // for custom stepping & termination


146  public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {


147  return new GbmState(problemData, lossFunction, randSeed, maxDepth, r, m, nu);


148  }


149 


150  // use default settings for maxDepth, nu, r from state


151  public static void MakeStep(IGbmState state) {


152  var gbmState = state as GbmState;


153  if (gbmState == null) throw new ArgumentException("state");


154 


155  MakeStep(gbmState, gbmState.maxDepth, gbmState.nu, gbmState.r, gbmState.m);


156  }


157 


158  // allow dynamic adaptation of maxDepth, nu and r


159  public static void MakeStep(IGbmState state, int maxDepth, double nu, double r, double m) {


160  var gbmState = state as GbmState;


161  if (gbmState == null) throw new ArgumentException("state");


162 


163  var problemData = gbmState.problemData;


164  var lossFunction = gbmState.lossFunction;


165  var yPred = gbmState.pred;


166  var yPredTest = gbmState.predTest;


167  var w = gbmState.w;


168  var treeBuilder = gbmState.treeBuilder;


169  var y = gbmState.y;


170  var activeIdx = gbmState.activeIdx;


171  var rim = gbmState.rim;


172 


173  // copy output of gradient function to preallocated rim array (pseudoresiduals)


174  int rimIdx = 0;


175  foreach (var g in lossFunction.GetLossGradient(y, yPred, w)) {


176  rim[rimIdx++] = g;


177  }


178 


179  var tree = treeBuilder.CreateRegressionTreeForGradientBoosting(rim, maxDepth, activeIdx, lossFunction.GetLineSearchFunc(y, yPred, w), r, m);


180 


181  int i = 0;


182  foreach (var pred in tree.GetEstimatedValues(problemData.Dataset, problemData.TrainingIndices)) {


183  yPred[i] = yPred[i] + nu * pred;


184  i++;


185  }


186  // update predictions for validation set


187  i = 0;


188  foreach (var pred in tree.GetEstimatedValues(problemData.Dataset, problemData.TestIndices)) {


189  yPredTest[i] = yPredTest[i] + nu * pred;


190  i++;


191  }


192 


193  gbmState.weights.Add(nu);


194  gbmState.models.Add(tree);


195  }


196  #endregion


197  }


198  }

