1  #region License Information


2  /* HeuristicLab


3  * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)


4  * and the BEACON Center for the Study of Evolution in Action.


5  *


6  * This file is part of HeuristicLab.


7  *


8  * HeuristicLab is free software: you can redistribute it and/or modify


9  * it under the terms of the GNU General Public License as published by


10  * the Free Software Foundation, either version 3 of the License, or


11  * (at your option) any later version.


12  *


13  * HeuristicLab is distributed in the hope that it will be useful,


14  * but WITHOUT ANY WARRANTY; without even the implied warranty of


15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


16  * GNU General Public License for more details.


17  *


18  * You should have received a copy of the GNU General Public License


19  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


20  */


21  #endregion


22 


23  using System;


24  using System.Collections.Generic;


25  using System.Diagnostics;


26  using System.Linq;


27  using HeuristicLab.Core;


28  using HeuristicLab.Problems.DataAnalysis;


29 


30  namespace HeuristicLab.Algorithms.DataAnalysis {


31  // This class implements a greedy decision tree learner which selects splits with the maximum reduction in sum of squared errors.


32  // The tree builder also tracks variable relevance metrics based on the splits and improvement after the split.


33  // The implementation is tuned for gradient boosting where multiple trees have to be calculated for the same training data


34  // each time with a different target vector. Vectors of idx to allow iteration of intput variables in sorted order are


35  // precalculated so that optimal thresholds for splits can be calculated in O(n) for each input variable.


36  // After each split the row idx are partitioned in a left an right part.


37  internal class RegressionTreeBuilder {


38  private readonly IRandom random;


39  private readonly IRegressionProblemData problemData;


40 


41  private readonly int nCols;


42  private readonly double[][] x; // all training data (original order from problemData), x is constant


43  private double[] originalY; // the original target labels (from problemData), originalY is constant


44  private double[] curPred; // current predictions for originalY (in case we are using gradient boosting, otherwise = zeros), only necessary for line search


45 


46  private double[] y; // training labels (original order from problemData), y can be changed


47 


48  private Dictionary<string, double> sumImprovements; // for variable relevance calculation


49 


50  private readonly string[] allowedVariables; // all variables in shuffled order


51  private Dictionary<string, int> varName2Index; // maps the variable names to column indexes


52  private int effectiveVars; // number of variables that are used from allowedVariables


53 


54  private int effectiveRows; // number of rows that are used from


55  private readonly int[][] sortedIdxAll;


56  private readonly int[][] sortedIdx; // random selection from sortedIdxAll (for r < 1.0)


57 


58  // helper arrays which are allocated to maximal necessary size only once in the ctor


59  private readonly int[] internalIdx, which, leftTmp, rightTmp;


60  private readonly double[] outx;


61  private readonly int[] outSortedIdx;


62 


63  private RegressionTreeModel.TreeNode[] tree; // tree is represented as a flat array of nodes


64  private int curTreeNodeIdx; // the index where the next tree node is stored


65 


66  // This class represents information about potential splits.


67  // For each node generated the best splitting variable and threshold as well as


68  // the improvement from the split are stored in a priority queue


69  private class PartitionSplits {


70  public int ParentNodeIdx { get; set; } // the idx of the leaf node representing this partition


71  public int StartIdx { get; set; } // the start idx of the partition


72  public int EndIndex { get; set; } // the end idx of the partition


73  public string SplittingVariable { get; set; } // the best splitting variable


74  public double SplittingThreshold { get; set; } // the best threshold


75  public double SplittingImprovement { get; set; } // the improvement of the split (for priority queue)


76  }


77 


78  // this list hold partitions with the information about the best split (organized as a sorted queue)


79  private readonly IList<PartitionSplits> queue;


80 


81  // prepare and allocate buffer variables in ctor


82  public RegressionTreeBuilder(IRegressionProblemData problemData, IRandom random) {


83  this.problemData = problemData;


84  this.random = random;


85 


86  var rows = problemData.TrainingIndices.Count();


87 


88  this.nCols = problemData.AllowedInputVariables.Count();


89 


90  allowedVariables = problemData.AllowedInputVariables.ToArray();


91  varName2Index = new Dictionary<string, int>(allowedVariables.Length);


92  for (int i = 0; i < allowedVariables.Length; i++) varName2Index.Add(allowedVariables[i], i);


93 


94  sortedIdxAll = new int[nCols][];


95  sortedIdx = new int[nCols][];


96  sumImprovements = new Dictionary<string, double>();


97  internalIdx = new int[rows];


98  which = new int[rows];


99  leftTmp = new int[rows];


100  rightTmp = new int[rows];


101  outx = new double[rows];


102  outSortedIdx = new int[rows];


103  queue = new List<PartitionSplits>(100);


104 


105  x = new double[nCols][];


106  originalY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();


107  y = new double[originalY.Length];


108  Array.Copy(originalY, y, y.Length); // copy values (originalY is fixed, y is changed in gradient boosting)


109  curPred = Enumerable.Repeat(0.0, y.Length).ToArray(); // zeros


110 


111  int col = 0;


112  foreach (var inputVariable in problemData.AllowedInputVariables) {


113  x[col] = problemData.Dataset.GetDoubleValues(inputVariable, problemData.TrainingIndices).ToArray();


114  sortedIdxAll[col] = Enumerable.Range(0, rows).OrderBy(r => x[col][r]).ToArray();


115  sortedIdx[col] = new int[rows];


116  col++;


117  }


118  }


119 


120  // specific interface that allows to specify the target labels and the training rows which is necessary when for gradient boosted trees


121  public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, double[] curPred, int maxSize, int[] idx, ILossFunction lossFunction, double r = 0.5, double m = 0.5) {


122  Debug.Assert(maxSize > 0);


123  Debug.Assert(r > 0);


124  Debug.Assert(r <= 1.0);


125  Debug.Assert(y.Count() == this.y.Length);


126  Debug.Assert(m > 0);


127  Debug.Assert(m <= 1.0);


128 


129  // y and curPred are changed in gradient boosting


130  this.y = y;


131  this.curPred = curPred;


132 


133  // shuffle row idx


134  HeuristicLab.Random.ListExtensions.ShuffleInPlace(idx, random);


135 


136  int nRows = idx.Count();


137 


138  // shuffle variable names


139  HeuristicLab.Random.ListExtensions.ShuffleInPlace(allowedVariables, random);


140 


141  // only select a part of the rows and columns randomly


142  effectiveRows = (int)Math.Ceiling(nRows * r);


143  effectiveVars = (int)Math.Ceiling(nCols * m);


144 


145  // the which array is used for partitioing row idxs


146  Array.Clear(which, 0, which.Length);


147 


148  // mark selected rows


149  for (int row = 0; row < effectiveRows; row++) {


150  which[idx[row]] = 1; // we use the which vector as a temporary variable here


151  internalIdx[row] = idx[row];


152  }


153 


154  for (int col = 0; col < nCols; col++) {


155  int i = 0;


156  for (int row = 0; row < nRows; row++) {


157  if (which[sortedIdxAll[col][row]] > 0) {


158  Debug.Assert(i < effectiveRows);


159  sortedIdx[col][i] = sortedIdxAll[col][row];


160  i++;


161  }


162  }


163  }


164 


165  this.tree = new RegressionTreeModel.TreeNode[maxSize];


166  this.queue.Clear();


167  this.curTreeNodeIdx = 0;


168 


169  // start out with only one leaf node (constant prediction)


170  // and calculate the best split for this root node and enqueue it into a queue sorted by improvement throught the split


171  // start and end idx are inclusive


172  CreateLeafNode(0, effectiveRows  1, lossFunction);


173 


174  // process the priority queue to complete the tree


175  CreateRegressionTreeFromQueue(maxSize, lossFunction);


176 


177  return new RegressionTreeModel(tree.ToArray(), problemData.TargetVariable);


178  }


179 


180 


181  // processes potential splits from the queue as long as splits are remaining and the maximum size of the tree is not reached


182  private void CreateRegressionTreeFromQueue(int maxNodes, ILossFunction lossFunction) {


183  while (queue.Any() && curTreeNodeIdx + 1 < maxNodes) { // two nodes are created in each loop


184  var f = queue[queue.Count  1]; // last element has the largest improvement


185  queue.RemoveAt(queue.Count  1);


186 


187  var startIdx = f.StartIdx;


188  var endIdx = f.EndIndex;


189 


190  Debug.Assert(endIdx  startIdx >= 0);


191  Debug.Assert(startIdx >= 0);


192  Debug.Assert(endIdx < internalIdx.Length);


193 


194  // split partition into left and right


195  int splitIdx;


196  SplitPartition(f.StartIdx, f.EndIndex, f.SplittingVariable, f.SplittingThreshold, out splitIdx);


197  Debug.Assert(splitIdx + 1 <= endIdx);


198  Debug.Assert(startIdx <= splitIdx);


199 


200  // create two leaf nodes (and enqueue best splits for both)


201  var leftTreeIdx = CreateLeafNode(startIdx, splitIdx, lossFunction);


202  var rightTreeIdx = CreateLeafNode(splitIdx + 1, endIdx, lossFunction);


203 


204  // overwrite existing leaf node with an internal node


205  tree[f.ParentNodeIdx] = new RegressionTreeModel.TreeNode(f.SplittingVariable, f.SplittingThreshold, leftTreeIdx, rightTreeIdx, weightLeft: (splitIdx  startIdx + 1) / (double)(endIdx  startIdx + 1));


206  }


207  }


208 


209 


210  // returns the index of the newly created tree node


211  private int CreateLeafNode(int startIdx, int endIdx, ILossFunction lossFunction) {


212  // write a leaf node


213  var val = lossFunction.LineSearch(originalY, curPred, internalIdx, startIdx, endIdx);


214  tree[curTreeNodeIdx] = new RegressionTreeModel.TreeNode(RegressionTreeModel.TreeNode.NO_VARIABLE, val);


215 


216  EnqueuePartitionSplit(curTreeNodeIdx, startIdx, endIdx);


217  curTreeNodeIdx++;


218  return curTreeNodeIdx  1;


219  }


220 


221 


222  // calculates the optimal split for the partition [startIdx .. endIdx] (inclusive)


223  // which is represented by the leaf node with the specified nodeIdx


224  private void EnqueuePartitionSplit(int nodeIdx, int startIdx, int endIdx) {


225  double threshold, improvement;


226  string bestVariableName;


227  // only enqueue a new split if there are at least 2 rows left and a split is possible


228  if (startIdx < endIdx &&


229  FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName, out improvement)) {


230  var split = new PartitionSplits() {


231  ParentNodeIdx = nodeIdx,


232  StartIdx = startIdx,


233  EndIndex = endIdx,


234  SplittingThreshold = threshold,


235  SplittingVariable = bestVariableName


236  };


237  InsertSortedQueue(split);


238  }


239  }


240 


241 


242  // routine for splitting a partition of rows stored in internalIdx between startIdx and endIdx into


243  // a left partition and a right partition using the given splittingVariable and threshold


244  // the splitIdx is the last index of the left partition


245  // splitIdx + 1 is the first index of the right partition


246  // startIdx and endIdx are inclusive


247  private void SplitPartition(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {


248  int bestVarIdx = varName2Index[splittingVar];


249  // split  two pass


250 


251  // store which index goes into which partition


252  for (int k = startIdx; k <= endIdx; k++) {


253  if (x[bestVarIdx][internalIdx[k]] <= threshold)


254  which[internalIdx[k]] = 1; // left partition


255  else


256  which[internalIdx[k]] = 1; // right partition


257  }


258 


259  // partition sortedIdx for each variable


260  int i;


261  int j;


262  for (int col = 0; col < nCols; col++) {


263  i = 0;


264  j = 0;


265  int k;


266  for (k = startIdx; k <= endIdx; k++) {


267  Debug.Assert(Math.Abs(which[sortedIdx[col][k]]) == 1);


268 


269  if (which[sortedIdx[col][k]] < 0) {


270  leftTmp[i++] = sortedIdx[col][k];


271  } else {


272  rightTmp[j++] = sortedIdx[col][k];


273  }


274  }


275  Debug.Assert(i > 0); // at least on element in the left partition


276  Debug.Assert(j > 0); // at least one element in the right partition


277  Debug.Assert(i + j == endIdx  startIdx + 1);


278  k = startIdx;


279  for (int l = 0; l < i; l++) sortedIdx[col][k++] = leftTmp[l];


280  for (int l = 0; l < j; l++) sortedIdx[col][k++] = rightTmp[l];


281  }


282 


283  // partition row indices


284  i = startIdx;


285  j = endIdx;


286  while (i <= j) {


287  Debug.Assert(Math.Abs(which[internalIdx[i]]) == 1);


288  Debug.Assert(Math.Abs(which[internalIdx[j]]) == 1);


289  if (which[internalIdx[i]] < 0) i++;


290  else if (which[internalIdx[j]] > 0) j;


291  else {


292  Debug.Assert(which[internalIdx[i]] > 0);


293  Debug.Assert(which[internalIdx[j]] < 0);


294  // swap


295  int tmp = internalIdx[i];


296  internalIdx[i] = internalIdx[j];


297  internalIdx[j] = tmp;


298  i++;


299  j;


300  }


301  }


302  Debug.Assert(j + 1 == i);


303  Debug.Assert(i <= endIdx);


304  Debug.Assert(startIdx <= j);


305 


306  splitIdx = j;


307  }


308 


309  private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar, out double improvement) {


310  Debug.Assert(startIdx < endIdx + 1); // at least 2 elements


311 


312  int rows = endIdx  startIdx + 1;


313  Debug.Assert(rows >= 2);


314 


315  double sumY = 0.0;


316  for (int i = startIdx; i <= endIdx; i++) {


317  sumY += y[internalIdx[i]];


318  }


319 


320  // see description of calculation in FindBestThreshold


321  double bestImprovement = 1.0 / rows * sumY * sumY; // any improvement must be larger than this baseline


322  double bestThreshold = double.PositiveInfinity;


323  bestVar = RegressionTreeModel.TreeNode.NO_VARIABLE;


324 


325  for (int col = 0; col < effectiveVars; col++) {


326  // sort values for variable to prepare for threshold selection


327  var curVariable = allowedVariables[col];


328  var curVariableIdx = varName2Index[curVariable];


329  for (int i = startIdx; i <= endIdx; i++) {


330  var sortedI = sortedIdx[curVariableIdx][i];


331  outSortedIdx[i  startIdx] = sortedI;


332  outx[i  startIdx] = x[curVariableIdx][sortedI];


333  }


334 


335  double curImprovement;


336  double curThreshold;


337  FindBestThreshold(outx, outSortedIdx, rows, y, sumY, out curThreshold, out curImprovement);


338 


339  if (curImprovement > bestImprovement) {


340  bestImprovement = curImprovement;


341  bestThreshold = curThreshold;


342  bestVar = allowedVariables[col];


343  }


344  }


345  if (bestVar == RegressionTreeModel.TreeNode.NO_VARIABLE) {


346  // not successfull


347  threshold = double.PositiveInfinity;


348  improvement = double.NegativeInfinity;


349  return false;


350  } else {


351  UpdateVariableRelevance(bestVar, sumY, bestImprovement, rows);


352  improvement = bestImprovement;


353  threshold = bestThreshold;


354  return true;


355  }


356  }


357 


358  // x [0..N1] contains rows sorted values in the range from [0..rows1]


359  // sortedIdx [0..N1] contains the idx of the values in x in the original dataset in the range from [0..rows1]


360  // rows specifies the number of valid entries in x and sortedIdx


361  // y [0..N1] contains the target values in original sorting order


362  // sumY is y.Sum()


363  //


364  // the routine returns the best threshold (x[i] + x[i+1]) / 2 for i = [0 .. rows2] by calculating the reduction in squared error


365  // additionally the reduction in squared error is returned in bestImprovement


366  // if all elements of x are equal the routing fails to produce a threshold


367  private static void FindBestThreshold(double[] x, int[] sortedIdx, int rows, double[] y, double sumY, out double bestThreshold, out double bestImprovement) {


368  Debug.Assert(rows >= 2);


369 


370  double sl = 0.0;


371  double sr = sumY;


372  double nl = 0.0;


373  double nr = rows;


374 


375  bestImprovement = 1.0 / rows * sumY * sumY; // this is the baseline for the improvement


376  bestThreshold = double.NegativeInfinity;


377  // for all thresholds


378  // if we have n rows there are n1 possible splits


379  for (int i = 0; i < rows  1; i++) {


380  sl += y[sortedIdx[i]];


381  sr = y[sortedIdx[i]];


382 


383  nl++;


384  nr;


385  Debug.Assert(nl > 0);


386  Debug.Assert(nr > 0);


387 


388  if (x[i] < x[i + 1]) { // don't try to split when two elements are equal


389 


390  // goal is to find the split with leading to minimal total variance of left and right parts


391  // without partitioning the variance is var(y) = E(y²)  E(y)²


392  // = 1/n * sum(y²)  (1/n * sum(y))²


393  //  


394  // constant baseline for improvement


395  //


396  // if we split into right and left part the overall variance is the weigthed combination nl/n * var(y_l) + nr/n * var(y_r)


397  // = nl/n * (1/nl * sum(y_l²)  (1/nl * sum(y_l))²) + nr/n * (1/nr * sum(y_r²)  (1/nr * sum(y_r))²)


398  // = 1/n * sum(y_l²)  1/nl * 1/n * sum(y_l)² + 1/n * sum(y_r²)  1/nr * 1/n * sum(y_r)²


399  // = 1/n * (sum(y_l²) + sum(y_r²))  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


400  // = 1/n * sum(y²)  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


401  // 


402  // not changed by split (and the same for total variance without partitioning)


403  //


404  // therefore we need to find the maximum value (sum(y_l)² / nl + sum(y_r)² / nr) (ignoring the factor 1/n)


405  // and this value must be larger than 1/n * sum(y)² to be an improvement over no split


406 


407  double curQuality = sl * sl / nl + sr * sr / nr;


408 


409  if (curQuality > bestImprovement) {


410  bestThreshold = (x[i] + x[i + 1]) / 2.0;


411  bestImprovement = curQuality;


412  }


413  }


414  }


415 


416  // if all elements where the same then no split can be found


417  }


418 


419 


420  private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {


421  if (string.IsNullOrEmpty(bestVar)) return;


422  // update variable relevance


423  double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect


424 


425  double delta = (bestImprovement  baseLine);


426  double v;


427  if (!sumImprovements.TryGetValue(bestVar, out v)) {


428  sumImprovements[bestVar] = delta;


429  }


430  sumImprovements[bestVar] = v + delta;


431  }


432 


433  public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {


434  // values are scaled: the most important variable has relevance = 100


435  double scaling = 100 / sumImprovements.Max(t => t.Value);


436  return


437  sumImprovements


438  .Select(t => new KeyValuePair<string, double>(t.Key, t.Value * scaling))


439  .OrderByDescending(t => t.Value);


440  }


441 


442 


443  // insert a new parition split (find insertion point and start at first element of the queue)


444  // elements are removed from the queue at the last position


445  // O(n), splits could be organized as a heap to improve runtime (see alglib tsort)


446  private void InsertSortedQueue(PartitionSplits split) {


447  // find insertion position


448  int i = 0;


449  while (i < queue.Count && queue[i].SplittingImprovement < split.SplittingImprovement) { i++; }


450 


451  queue.Insert(i, split);


452  }


453  }


454  }


455 


456 


457 


458 


459 


460 


461 


462 


463 


464 


465 


466 


467 


468 


469 


470 


471 


472 


473 

