1  #region License Information


2  /* HeuristicLab


3  * Copyright (C) 20022015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)


4  * and the BEACON Center for the Study of Evolution in Action.


5  *


6  * This file is part of HeuristicLab.


7  *


8  * HeuristicLab is free software: you can redistribute it and/or modify


9  * it under the terms of the GNU General Public License as published by


10  * the Free Software Foundation, either version 3 of the License, or


11  * (at your option) any later version.


12  *


13  * HeuristicLab is distributed in the hope that it will be useful,


14  * but WITHOUT ANY WARRANTY; without even the implied warranty of


15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


16  * GNU General Public License for more details.


17  *


18  * You should have received a copy of the GNU General Public License


19  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


20  */


21  #endregion


22 


23  using System;


24  using System.Collections;


25  using System.Collections.Generic;


26  using System.Diagnostics;


27  using System.Linq;


28  using HeuristicLab.Core;


29  using HeuristicLab.Problems.DataAnalysis;


30 


31  namespace HeuristicLab.Algorithms.DataAnalysis {


32  // This class implements a greedy decision tree learner which selects splits with the maximum reduction in sum of squared errors.


33  // The tree builder also tracks variable relevance metrics based on the splits and improvement after the split.


34  // The implementation is tuned for gradient boosting where multiple trees have to be calculated for the same training data


35  // each time with a different target vector. Vectors of idx to allow iteration of intput variables in sorted order are


36  // precalculated so that optimal thresholds for splits can be calculated in O(n) for each input variable.


37  // After each split the row idx are partitioned in a left an right part.


38  internal class RegressionTreeBuilder {


39  private readonly IRandom random;


40  private readonly IRegressionProblemData problemData;


41 


42  private readonly int nCols;


43  private readonly double[][] x; // all training data (original order from problemData), x is constant


44  private double[] originalY; // the original target labels (from problemData), originalY is constant


45  private double[] curPred; // current predictions for originalY (in case we are using gradient boosting, otherwise = zeros), only necessary for line search


46 


47  private double[] y; // training labels (original order from problemData), y can be changed


48 


49  private Dictionary<string, double> sumImprovements; // for variable relevance calculation


50 


51  private readonly string[] allowedVariables; // all variables in shuffled order


52  private Dictionary<string, int> varName2Index; // maps the variable names to column indexes


53  private int effectiveVars; // number of variables that are used from allowedVariables


54 


55  private int effectiveRows; // number of rows that are used from


56  private readonly int[][] sortedIdxAll;


57  private readonly int[][] sortedIdx; // random selection from sortedIdxAll (for r < 1.0)


58 


59  // helper arrays which are allocated to maximal necessary size only once in the ctor


60  private readonly int[] internalIdx, which, leftTmp, rightTmp;


61  private readonly double[] outx;


62  private readonly int[] outSortedIdx;


63 


64  private RegressionTreeModel.TreeNode[] tree; // tree is represented as a flat array of nodes


65  private int curTreeNodeIdx; // the index where the next tree node is stored


66 


67  // This class represents information about potential splits.


68  // For each node generated the best splitting variable and threshold as well as


69  // the improvement from the split are stored in a priority queue


70  private class PartitionSplits {


71  public int ParentNodeIdx { get; set; } // the idx of the leaf node representing this partition


72  public int StartIdx { get; set; } // the start idx of the partition


73  public int EndIndex { get; set; } // the end idx of the partition


74  public string SplittingVariable { get; set; } // the best splitting variable


75  public double SplittingThreshold { get; set; } // the best threshold


76  public double SplittingImprovement { get; set; } // the improvement of the split (for priority queue)


77  }


78 


79  // this list hold partitions with the information about the best split (organized as a sorted queue)


80  private readonly IList<PartitionSplits> queue;


81 


82  // prepare and allocate buffer variables in ctor


83  public RegressionTreeBuilder(IRegressionProblemData problemData, IRandom random) {


84  this.problemData = problemData;


85  this.random = random;


86 


87  var rows = problemData.TrainingIndices.Count();


88 


89  this.nCols = problemData.AllowedInputVariables.Count();


90 


91  allowedVariables = problemData.AllowedInputVariables.ToArray();


92  varName2Index = new Dictionary<string, int>(allowedVariables.Length);


93  for (int i = 0; i < allowedVariables.Length; i++) varName2Index.Add(allowedVariables[i], i);


94 


95  sortedIdxAll = new int[nCols][];


96  sortedIdx = new int[nCols][];


97  sumImprovements = new Dictionary<string, double>();


98  internalIdx = new int[rows];


99  which = new int[rows];


100  leftTmp = new int[rows];


101  rightTmp = new int[rows];


102  outx = new double[rows];


103  outSortedIdx = new int[rows];


104  queue = new List<PartitionSplits>(100);


105 


106  x = new double[nCols][];


107  originalY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();


108  y = new double[originalY.Length];


109  Array.Copy(originalY, y, y.Length); // copy values (originalY is fixed, y is changed in gradient boosting)


110  curPred = Enumerable.Repeat(0.0, y.Length).ToArray(); // zeros


111 


112  int col = 0;


113  foreach (var inputVariable in problemData.AllowedInputVariables) {


114  x[col] = problemData.Dataset.GetDoubleValues(inputVariable, problemData.TrainingIndices).ToArray();


115  sortedIdxAll[col] = Enumerable.Range(0, rows).OrderBy(r => x[col][r]).ToArray();


116  sortedIdx[col] = new int[rows];


117  col++;


118  }


119  }


120 


121  // specific interface that allows to specify the target labels and the training rows which is necessary when for gradient boosted trees


122  public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, double[] curPred, int maxSize, int[] idx, ILossFunction lossFunction, double r = 0.5, double m = 0.5) {


123  Debug.Assert(maxSize > 0);


124  Debug.Assert(r > 0);


125  Debug.Assert(r <= 1.0);


126  Debug.Assert(y.Count() == this.y.Length);


127  Debug.Assert(m > 0);


128  Debug.Assert(m <= 1.0);


129 


130  // y and curPred are changed in gradient boosting


131  this.y = y;


132  this.curPred = curPred;


133 


134  // shuffle row idx


135  HeuristicLab.Random.ListExtensions.ShuffleInPlace(idx, random);


136 


137  int nRows = idx.Count();


138 


139  // shuffle variable idx


140  HeuristicLab.Random.ListExtensions.ShuffleInPlace(allowedVariables, random);


141 


142  // only select a part of the rows and columns randomly


143  effectiveRows = (int)Math.Ceiling(nRows * r);


144  effectiveVars = (int)Math.Ceiling(nCols * m);


145 


146  // the which array is used for partitioing row idxs


147  Array.Clear(which, 0, which.Length);


148 


149  // mark selected rows


150  for (int row = 0; row < effectiveRows; row++) {


151  which[idx[row]] = 1; // we use the which vector as a temporary variable here


152  internalIdx[row] = idx[row];


153  }


154 


155  for (int col = 0; col < nCols; col++) {


156  int i = 0;


157  for (int row = 0; row < nRows; row++) {


158  if (which[sortedIdxAll[col][row]] > 0) {


159  Debug.Assert(i < effectiveRows);


160  sortedIdx[col][i] = sortedIdxAll[col][row];


161  i++;


162  }


163  }


164  }


165 


166  this.tree = new RegressionTreeModel.TreeNode[maxSize];


167  this.queue.Clear();


168  this.curTreeNodeIdx = 0;


169 


170  // start out with only one leaf node (constant prediction)


171  // and calculate the best split for this root node and enqueue it into a queue sorted by improvement throught the split


172  // start and end idx are inclusive


173  CreateLeafNode(0, effectiveRows  1, lossFunction);


174 


175  // process the priority queue to complete the tree


176  CreateRegressionTreeFromQueue(maxSize, lossFunction);


177 


178  return new RegressionTreeModel(tree.ToArray());


179  }


180 


181 


182  // processes potential splits from the queue as long as splits are left and the maximum size of the tree is not reached


183  private void CreateRegressionTreeFromQueue(int maxNodes, ILossFunction lossFunction) {


184  while (queue.Any() && curTreeNodeIdx + 1 < maxNodes) { // two nodes are created in each loop


185  var f = queue[queue.Count  1]; // last element has the largest improvement


186  queue.RemoveAt(queue.Count  1);


187 


188  var startIdx = f.StartIdx;


189  var endIdx = f.EndIndex;


190 


191  Debug.Assert(endIdx  startIdx >= 0);


192  Debug.Assert(startIdx >= 0);


193  Debug.Assert(endIdx < internalIdx.Length);


194 


195  // split partition into left and right


196  int splitIdx;


197  SplitPartition(f.StartIdx, f.EndIndex, f.SplittingVariable, f.SplittingThreshold, out splitIdx);


198  Debug.Assert(splitIdx + 1 <= endIdx);


199  Debug.Assert(startIdx <= splitIdx);


200 


201  // create two leaf nodes (and enqueue best splits for both)


202  var leftTreeIdx = CreateLeafNode(startIdx, splitIdx, lossFunction);


203  var rightTreeIdx = CreateLeafNode(splitIdx + 1, endIdx, lossFunction);


204 


205  // overwrite existing leaf node with an internal node


206  tree[f.ParentNodeIdx] = new RegressionTreeModel.TreeNode(f.SplittingVariable, f.SplittingThreshold, leftTreeIdx, rightTreeIdx);


207  }


208  }


209 


210 


211  // returns the index of the newly created tree node


212  private int CreateLeafNode(int startIdx, int endIdx, ILossFunction lossFunction) {


213  // write a leaf node


214  var val = lossFunction.LineSearch(originalY, curPred, internalIdx, startIdx, endIdx);


215  tree[curTreeNodeIdx] = new RegressionTreeModel.TreeNode(RegressionTreeModel.TreeNode.NO_VARIABLE, val);


216 


217  EnqueuePartitionSplit(curTreeNodeIdx, startIdx, endIdx);


218  curTreeNodeIdx++;


219  return curTreeNodeIdx  1;


220  }


221 


222 


223  // calculates the optimal split for the partition [startIdx .. endIdx] (inclusive)


224  // which is represented by the leaf node with the specified nodeIdx


225  private void EnqueuePartitionSplit(int nodeIdx, int startIdx, int endIdx) {


226  double threshold, improvement;


227  string bestVariableName;


228  // only enqueue a new split if there are at least 2 rows left and a split is possible


229  if (startIdx < endIdx &&


230  FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName, out improvement)) {


231  var split = new PartitionSplits() {


232  ParentNodeIdx = nodeIdx,


233  StartIdx = startIdx,


234  EndIndex = endIdx,


235  SplittingThreshold = threshold,


236  SplittingVariable = bestVariableName


237  };


238  InsertSortedQueue(split);


239  }


240  }


241 


242 


243  // routine for splitting a partition of rows stored in internalIdx between startIdx and endIdx into


244  // a left partition and a right partition using the given splittingVariable and threshold


245  // the splitIdx is the last index of the left partition


246  // splitIdx + 1 is the first index of the right partition


247  // startIdx and endIdx are inclusive


248  private void SplitPartition(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {


249  int bestVarIdx = varName2Index[splittingVar];


250  // split  two pass


251 


252  // store which index goes into which partition


253  for (int k = startIdx; k <= endIdx; k++) {


254  if (x[bestVarIdx][internalIdx[k]] <= threshold)


255  which[internalIdx[k]] = 1; // left partition


256  else


257  which[internalIdx[k]] = 1; // right partition


258  }


259 


260  // partition sortedIdx for each variable


261  int i;


262  int j;


263  for (int col = 0; col < nCols; col++) {


264  i = 0;


265  j = 0;


266  int k;


267  for (k = startIdx; k <= endIdx; k++) {


268  Debug.Assert(Math.Abs(which[sortedIdx[col][k]]) == 1);


269 


270  if (which[sortedIdx[col][k]] < 0) {


271  leftTmp[i++] = sortedIdx[col][k];


272  } else {


273  rightTmp[j++] = sortedIdx[col][k];


274  }


275  }


276  Debug.Assert(i > 0); // at least on element in the left partition


277  Debug.Assert(j > 0); // at least one element in the right partition


278  Debug.Assert(i + j == endIdx  startIdx + 1);


279  k = startIdx;


280  for (int l = 0; l < i; l++) sortedIdx[col][k++] = leftTmp[l];


281  for (int l = 0; l < j; l++) sortedIdx[col][k++] = rightTmp[l];


282  }


283 


284  // partition row indices


285  i = startIdx;


286  j = endIdx;


287  while (i <= j) {


288  Debug.Assert(Math.Abs(which[internalIdx[i]]) == 1);


289  Debug.Assert(Math.Abs(which[internalIdx[j]]) == 1);


290  if (which[internalIdx[i]] < 0) i++;


291  else if (which[internalIdx[j]] > 0) j;


292  else {


293  Debug.Assert(which[internalIdx[i]] > 0);


294  Debug.Assert(which[internalIdx[j]] < 0);


295  // swap


296  int tmp = internalIdx[i];


297  internalIdx[i] = internalIdx[j];


298  internalIdx[j] = tmp;


299  i++;


300  j;


301  }


302  }


303  Debug.Assert(j + 1 == i);


304  Debug.Assert(i <= endIdx);


305  Debug.Assert(startIdx <= j);


306 


307  splitIdx = j;


308  }


309 


310  private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar, out double improvement) {


311  Debug.Assert(startIdx < endIdx + 1); // at least 2 elements


312 


313  int rows = endIdx  startIdx + 1;


314  Debug.Assert(rows >= 2);


315 


316  double sumY = 0.0;


317  for (int i = startIdx; i <= endIdx; i++) {


318  sumY += y[internalIdx[i]];


319  }


320 


321  // see description of calculation in FindBestThreshold


322  double bestImprovement = 1.0 / rows * sumY * sumY; // any improvement must be larger than this baseline


323  double bestThreshold = double.PositiveInfinity;


324  bestVar = RegressionTreeModel.TreeNode.NO_VARIABLE;


325 


326  for (int col = 0; col < effectiveVars; col++) {


327  // sort values for variable to prepare for threshold selection


328  var curVariable = allowedVariables[col];


329  var curVariableIdx = varName2Index[curVariable];


330  for (int i = startIdx; i <= endIdx; i++) {


331  var sortedI = sortedIdx[curVariableIdx][i];


332  outSortedIdx[i  startIdx] = sortedI;


333  outx[i  startIdx] = x[curVariableIdx][sortedI];


334  }


335 


336  double curImprovement;


337  double curThreshold;


338  FindBestThreshold(outx, outSortedIdx, rows, y, sumY, out curThreshold, out curImprovement);


339 


340  if (curImprovement > bestImprovement) {


341  bestImprovement = curImprovement;


342  bestThreshold = curThreshold;


343  bestVar = allowedVariables[col];


344  }


345  }


346  if (bestVar == RegressionTreeModel.TreeNode.NO_VARIABLE) {


347  // not successfull


348  threshold = double.PositiveInfinity;


349  improvement = double.NegativeInfinity;


350  return false;


351  } else {


352  UpdateVariableRelevance(bestVar, sumY, bestImprovement, rows);


353  improvement = bestImprovement;


354  threshold = bestThreshold;


355  return true;


356  }


357  }


358 


359  // x [0..N1] contains rows sorted values in the range from [0..rows1]


360  // sortedIdx [0..N1] contains the idx of the values in x in the original dataset in the range from [0..rows1]


361  // rows specifies the number of valid entries in x and sortedIdx


362  // y [0..N1] contains the target values in original sorting order


363  // sumY is y.Sum()


364  //


365  // the routine returns the best threshold (x[i] + x[i+1]) / 2 for i = [0 .. rows2] by calculating the reduction in squared error


366  // additionally the reduction in squared error is returned in bestImprovement


367  // if all elements of x are equal the routing fails to produce a threshold


368  private static void FindBestThreshold(double[] x, int[] sortedIdx, int rows, double[] y, double sumY, out double bestThreshold, out double bestImprovement) {


369  Debug.Assert(rows >= 2);


370 


371  double sl = 0.0;


372  double sr = sumY;


373  double nl = 0.0;


374  double nr = rows;


375 


376  bestImprovement = 1.0 / rows * sumY * sumY; // this is the baseline for the improvement


377  bestThreshold = double.NegativeInfinity;


378  // for all thresholds


379  // if we have n rows there are n1 possible splits


380  for (int i = 0; i < rows  1; i++) {


381  sl += y[sortedIdx[i]];


382  sr = y[sortedIdx[i]];


383 


384  nl++;


385  nr;


386  Debug.Assert(nl > 0);


387  Debug.Assert(nr > 0);


388 


389  if (x[i] < x[i + 1]) { // don't try to split when two elements are equal


390 


391  // goal is to find the split with leading to minimal total variance of left and right parts


392  // without partitioning the variance is var(y) = E(y²)  E(y)²


393  // = 1/n * sum(y²)  (1/n * sum(y))²


394  //  


395  // constant baseline for improvement


396  //


397  // if we split into right and left part the overall variance is the weigthed combination nl/n * var(y_l) + nr/n * var(y_r)


398  // = nl/n * (1/nl * sum(y_l²)  (1/nl * sum(y_l))²) + nr/n * (1/nr * sum(y_r²)  (1/nr * sum(y_r))²)


399  // = 1/n * sum(y_l²)  1/nl * 1/n * sum(y_l)² + 1/n * sum(y_r²)  1/nr * 1/n * sum(y_r)²


400  // = 1/n * (sum(y_l²) + sum(y_r²))  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


401  // = 1/n * sum(y²)  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


402  // 


403  // not changed by split (and the same for total variance without partitioning)


404  //


405  // therefore we need to find the maximum value (sum(y_l)² / nl + sum(y_r)² / nr) (ignoring the factor 1/n)


406  // and this value must be larger than 1/n * sum(y)² to be an improvement over no split


407 


408  double curQuality = sl * sl / nl + sr * sr / nr;


409 


410  if (curQuality > bestImprovement) {


411  bestThreshold = (x[i] + x[i + 1]) / 2.0;


412  bestImprovement = curQuality;


413  }


414  }


415  }


416 


417  // if all elements where the same then no split can be found


418  }


419 


420 


421  private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {


422  if (string.IsNullOrEmpty(bestVar)) return;


423  // update variable relevance


424  double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect


425 


426  double delta = (bestImprovement  baseLine);


427  double v;


428  if (!sumImprovements.TryGetValue(bestVar, out v)) {


429  sumImprovements[bestVar] = delta;


430  }


431  sumImprovements[bestVar] = v + delta;


432  }


433 


434  public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {


435  // values are scaled: the most important variable has relevance = 100


436  double scaling = 100 / sumImprovements.Max(t => t.Value);


437  return


438  sumImprovements


439  .Select(t => new KeyValuePair<string, double>(t.Key, t.Value * scaling))


440  .OrderByDescending(t => t.Value);


441  }


442 


443 


444  // insert a new parition split (find insertion point and start at first element of the queue)


445  // elements are removed from the queue at the last position


446  // O(n), splits could be organized as a heap to improve runtime (see alglib tsort)


447  private void InsertSortedQueue(PartitionSplits split) {


448  // find insertion position


449  int i = 0;


450  while (i < queue.Count && queue[i].SplittingImprovement < split.SplittingImprovement) { i++; }


451 


452  queue.Insert(i, split);


453  }


454  }


455  }


456 


457 


458 


459 


460 


461 


462 


463 


464 


465 


466 


467 


468 


469 


470 


471 


472 


473 


474 

