[12590]  1  #region License Information


 2  /* HeuristicLab


 3  * Copyright (C) 20022015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)


 4  * and the BEACON Center for the Study of Evolution in Action.


 5  *


 6  * This file is part of HeuristicLab.


 7  *


 8  * HeuristicLab is free software: you can redistribute it and/or modify


 9  * it under the terms of the GNU General Public License as published by


 10  * the Free Software Foundation, either version 3 of the License, or


 11  * (at your option) any later version.


 12  *


 13  * HeuristicLab is distributed in the hope that it will be useful,


 14  * but WITHOUT ANY WARRANTY; without even the implied warranty of


 15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


 16  * GNU General Public License for more details.


 17  *


 18  * You should have received a copy of the GNU General Public License


 19  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


 20  */


 21  #endregion


 22 


 23  using System;


[12620]  24  using System.Collections;


[12332]  25  using System.Collections.Generic;


 26  using System.Diagnostics;


 27  using System.Linq;


 28  using HeuristicLab.Core;


 29  using HeuristicLab.Problems.DataAnalysis;


 30 


[12590]  31  namespace HeuristicLab.Algorithms.DataAnalysis {


 32  // This class implements a greedy decision tree learner which selects splits with the maximum reduction in sum of squared errors.


 33  // The tree builder also tracks variable relevance metrics based on the splits and improvement after the split.


 34  // The implementation is tuned for gradient boosting where multiple trees have to be calculated for the same training data


 35  // each time with a different target vector. Vectors of idx to allow iteration of intput variables in sorted order are


 36  // precalculated so that optimal thresholds for splits can be calculated in O(n) for each input variable.


 37  // After each split the row idx are partitioned in a left an right part.


[12661]  38  internal class RegressionTreeBuilder {


[12332]  39  private readonly IRandom random;


 40  private readonly IRegressionProblemData problemData;


 41 


 42  private readonly int nCols;


[12590]  43  private readonly double[][] x; // all training data (original order from problemData), x is constant


[12697]  44  private double[] originalY; // the original target labels (from problemData), originalY is constant


 45  private double[] curPred; // current predictions for originalY (in case we are using gradient boosting, otherwise = zeros), only necessary for line search


 46 


[12590]  47  private double[] y; // training labels (original order from problemData), y can be changed


[12332]  48 


 49  private Dictionary<string, double> sumImprovements; // for variable relevance calculation


 50 


 51  private readonly string[] allowedVariables; // all variables in shuffled order


 52  private Dictionary<string, int> varName2Index; // maps the variable names to column indexes


 53  private int effectiveVars; // number of variables that are used from allowedVariables


 54 


 55  private int effectiveRows; // number of rows that are used from


 56  private readonly int[][] sortedIdxAll;


 57  private readonly int[][] sortedIdx; // random selection from sortedIdxAll (for r < 1.0)


 58 


 59  // helper arrays which are allocated to maximal necessary size only once in the ctor


 60  private readonly int[] internalIdx, which, leftTmp, rightTmp;


 61  private readonly double[] outx;


 62  private readonly int[] outSortedIdx;


[12372]  63 


 64  private RegressionTreeModel.TreeNode[] tree; // tree is represented as a flat array of nodes


 65  private int curTreeNodeIdx; // the index where the next tree node is stored


 66 


[12632]  67  // This class represents information about potential splits.


 68  // For each node generated the best splitting variable and threshold as well as


 69  // the improvement from the split are stored in a priority queue


 70  private class PartitionSplits {


 71  public int ParentNodeIdx { get; set; } // the idx of the leaf node representing this partition


 72  public int StartIdx { get; set; } // the start idx of the partition


 73  public int EndIndex { get; set; } // the end idx of the partition


 74  public string SplittingVariable { get; set; } // the best splitting variable


 75  public double SplittingThreshold { get; set; } // the best threshold


 76  public double SplittingImprovement { get; set; } // the improvement of the split (for priority queue)


[12619]  77  }


[12332]  78 


[12632]  79  // this list hold partitions with the information about the best split (organized as a sorted queue)


 80  private readonly IList<PartitionSplits> queue;


 81 


[12332]  82  // prepare and allocate buffer variables in ctor


 83  public RegressionTreeBuilder(IRegressionProblemData problemData, IRandom random) {


 84  this.problemData = problemData;


 85  this.random = random;


 86 


 87  var rows = problemData.TrainingIndices.Count();


 88 


 89  this.nCols = problemData.AllowedInputVariables.Count();


 90 


 91  allowedVariables = problemData.AllowedInputVariables.ToArray();


 92  varName2Index = new Dictionary<string, int>(allowedVariables.Length);


 93  for (int i = 0; i < allowedVariables.Length; i++) varName2Index.Add(allowedVariables[i], i);


 94 


 95  sortedIdxAll = new int[nCols][];


 96  sortedIdx = new int[nCols][];


 97  sumImprovements = new Dictionary<string, double>();


 98  internalIdx = new int[rows];


 99  which = new int[rows];


 100  leftTmp = new int[rows];


 101  rightTmp = new int[rows];


 102  outx = new double[rows];


 103  outSortedIdx = new int[rows];


[12632]  104  queue = new List<PartitionSplits>(100);


[12332]  105 


 106  x = new double[nCols][];


[12697]  107  originalY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();


 108  y = new double[originalY.Length];


 109  Array.Copy(originalY, y, y.Length); // copy values (originalY is fixed, y is changed in gradient boosting)


 110  curPred = Enumerable.Repeat(0.0, y.Length).ToArray(); // zeros


[12332]  111 


 112  int col = 0;


 113  foreach (var inputVariable in problemData.AllowedInputVariables) {


 114  x[col] = problemData.Dataset.GetDoubleValues(inputVariable, problemData.TrainingIndices).ToArray();


 115  sortedIdxAll[col] = Enumerable.Range(0, rows).OrderBy(r => x[col][r]).ToArray();


 116  sortedIdx[col] = new int[rows];


 117  col++;


 118  }


 119  }


 120 


[12590]  121  // specific interface that allows to specify the target labels and the training rows which is necessary when for gradient boosted trees


[12697]  122  public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, double[] curPred, int maxSize, int[] idx, ILossFunction lossFunction, double r = 0.5, double m = 0.5) {


[12632]  123  Debug.Assert(maxSize > 0);


[12619]  124  Debug.Assert(r > 0);


 125  Debug.Assert(r <= 1.0);


 126  Debug.Assert(y.Count() == this.y.Length);


 127  Debug.Assert(m > 0);


 128  Debug.Assert(m <= 1.0);


[12332]  129 


[12697]  130  // y and curPred are changed in gradient boosting


 131  this.y = y;


 132  this.curPred = curPred;


[12332]  133 


 134  // shuffle row idx


 135  HeuristicLab.Random.ListExtensions.ShuffleInPlace(idx, random);


 136 


 137  int nRows = idx.Count();


 138 


[13992]  139  // shuffle variable names


[12332]  140  HeuristicLab.Random.ListExtensions.ShuffleInPlace(allowedVariables, random);


 141 


[12590]  142  // only select a part of the rows and columns randomly


[12332]  143  effectiveRows = (int)Math.Ceiling(nRows * r);


 144  effectiveVars = (int)Math.Ceiling(nCols * m);


 145 


[12697]  146  // the which array is used for partitioing row idxs


[12332]  147  Array.Clear(which, 0, which.Length);


 148 


 149  // mark selected rows


 150  for (int row = 0; row < effectiveRows; row++) {


[12590]  151  which[idx[row]] = 1; // we use the which vector as a temporary variable here


[12332]  152  internalIdx[row] = idx[row];


 153  }


 154 


 155  for (int col = 0; col < nCols; col++) {


 156  int i = 0;


 157  for (int row = 0; row < nRows; row++) {


 158  if (which[sortedIdxAll[col][row]] > 0) {


[12590]  159  Debug.Assert(i < effectiveRows);


[12332]  160  sortedIdx[col][i] = sortedIdxAll[col][row];


 161  i++;


 162  }


 163  }


 164  }


[12372]  165 


[12632]  166  this.tree = new RegressionTreeModel.TreeNode[maxSize];


 167  this.queue.Clear();


[12372]  168  this.curTreeNodeIdx = 0;


 169 


[12632]  170  // start out with only one leaf node (constant prediction)


 171  // and calculate the best split for this root node and enqueue it into a queue sorted by improvement throught the split


[12332]  172  // start and end idx are inclusive


[12697]  173  CreateLeafNode(0, effectiveRows  1, lossFunction);


[12619]  174 


[12632]  175  // process the priority queue to complete the tree


[12697]  176  CreateRegressionTreeFromQueue(maxSize, lossFunction);


[12632]  177 


[13992]  178  return new RegressionTreeModel(tree.ToArray(), problemData.TargetVariable, );


[12332]  179  }


 180 


 181 


[13895]  182  // processes potential splits from the queue as long as splits are remaining and the maximum size of the tree is not reached


[12697]  183  private void CreateRegressionTreeFromQueue(int maxNodes, ILossFunction lossFunction) {


[12632]  184  while (queue.Any() && curTreeNodeIdx + 1 < maxNodes) { // two nodes are created in each loop


 185  var f = queue[queue.Count  1]; // last element has the largest improvement


 186  queue.RemoveAt(queue.Count  1);


 187 


[12619]  188  var startIdx = f.StartIdx;


 189  var endIdx = f.EndIndex;


 190 


 191  Debug.Assert(endIdx  startIdx >= 0);


 192  Debug.Assert(startIdx >= 0);


 193  Debug.Assert(endIdx < internalIdx.Length);


 194 


[12632]  195  // split partition into left and right


 196  int splitIdx;


 197  SplitPartition(f.StartIdx, f.EndIndex, f.SplittingVariable, f.SplittingThreshold, out splitIdx);


 198  Debug.Assert(splitIdx + 1 <= endIdx);


 199  Debug.Assert(startIdx <= splitIdx);


[12332]  200 


[12632]  201  // create two leaf nodes (and enqueue best splits for both)


[12697]  202  var leftTreeIdx = CreateLeafNode(startIdx, splitIdx, lossFunction);


 203  var rightTreeIdx = CreateLeafNode(splitIdx + 1, endIdx, lossFunction);


[12658]  204 


 205  // overwrite existing leaf node with an internal node


[13895]  206  tree[f.ParentNodeIdx] = new RegressionTreeModel.TreeNode(f.SplittingVariable, f.SplittingThreshold, leftTreeIdx, rightTreeIdx, weightLeft: (splitIdx  startIdx + 1) / (double)(endIdx  startIdx + 1));


[12632]  207  }


 208  }


[12332]  209 


 210 


[12632]  211  // returns the index of the newly created tree node


[12697]  212  private int CreateLeafNode(int startIdx, int endIdx, ILossFunction lossFunction) {


[12658]  213  // write a leaf node


[12697]  214  var val = lossFunction.LineSearch(originalY, curPred, internalIdx, startIdx, endIdx);


[12658]  215  tree[curTreeNodeIdx] = new RegressionTreeModel.TreeNode(RegressionTreeModel.TreeNode.NO_VARIABLE, val);


[12332]  216 


[12632]  217  EnqueuePartitionSplit(curTreeNodeIdx, startIdx, endIdx);


 218  curTreeNodeIdx++;


 219  return curTreeNodeIdx  1;


[12619]  220  }


[12332]  221 


 222 


[12632]  223  // calculates the optimal split for the partition [startIdx .. endIdx] (inclusive)


 224  // which is represented by the leaf node with the specified nodeIdx


 225  private void EnqueuePartitionSplit(int nodeIdx, int startIdx, int endIdx) {


 226  double threshold, improvement;


 227  string bestVariableName;


 228  // only enqueue a new split if there are at least 2 rows left and a split is possible


 229  if (startIdx < endIdx &&


 230  FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName, out improvement)) {


 231  var split = new PartitionSplits() {


 232  ParentNodeIdx = nodeIdx,


 233  StartIdx = startIdx,


 234  EndIndex = endIdx,


 235  SplittingThreshold = threshold,


 236  SplittingVariable = bestVariableName


 237  };


 238  InsertSortedQueue(split);


 239  }


[12619]  240  }


[12332]  241 


[12632]  242 


 243  // routine for splitting a partition of rows stored in internalIdx between startIdx and endIdx into


 244  // a left partition and a right partition using the given splittingVariable and threshold


 245  // the splitIdx is the last index of the left partition


 246  // splitIdx + 1 is the first index of the right partition


[12619]  247  // startIdx and endIdx are inclusive


[12632]  248  private void SplitPartition(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {


[12619]  249  int bestVarIdx = varName2Index[splittingVar];


 250  // split  two pass


[12332]  251 


[12623]  252  // store which index goes into which partition


[12619]  253  for (int k = startIdx; k <= endIdx; k++) {


 254  if (x[bestVarIdx][internalIdx[k]] <= threshold)


 255  which[internalIdx[k]] = 1; // left partition


 256  else


 257  which[internalIdx[k]] = 1; // right partition


 258  }


[12372]  259 


[12619]  260  // partition sortedIdx for each variable


 261  int i;


 262  int j;


 263  for (int col = 0; col < nCols; col++) {


 264  i = 0;


 265  j = 0;


 266  int k;


 267  for (k = startIdx; k <= endIdx; k++) {


 268  Debug.Assert(Math.Abs(which[sortedIdx[col][k]]) == 1);


 269 


 270  if (which[sortedIdx[col][k]] < 0) {


 271  leftTmp[i++] = sortedIdx[col][k];


 272  } else {


 273  rightTmp[j++] = sortedIdx[col][k];


 274  }


[12332]  275  }


[12619]  276  Debug.Assert(i > 0); // at least on element in the left partition


 277  Debug.Assert(j > 0); // at least one element in the right partition


 278  Debug.Assert(i + j == endIdx  startIdx + 1);


 279  k = startIdx;


 280  for (int l = 0; l < i; l++) sortedIdx[col][k++] = leftTmp[l];


 281  for (int l = 0; l < j; l++) sortedIdx[col][k++] = rightTmp[l];


[12332]  282  }


[12619]  283 


 284  // partition row indices


 285  i = startIdx;


 286  j = endIdx;


 287  while (i <= j) {


 288  Debug.Assert(Math.Abs(which[internalIdx[i]]) == 1);


 289  Debug.Assert(Math.Abs(which[internalIdx[j]]) == 1);


 290  if (which[internalIdx[i]] < 0) i++;


 291  else if (which[internalIdx[j]] > 0) j;


 292  else {


 293  Debug.Assert(which[internalIdx[i]] > 0);


 294  Debug.Assert(which[internalIdx[j]] < 0);


 295  // swap


 296  int tmp = internalIdx[i];


 297  internalIdx[i] = internalIdx[j];


 298  internalIdx[j] = tmp;


 299  i++;


 300  j;


 301  }


 302  }


 303  Debug.Assert(j + 1 == i);


 304  Debug.Assert(i <= endIdx);


 305  Debug.Assert(startIdx <= j);


 306 


 307  splitIdx = j;


[12332]  308  }


 309 


[12632]  310  private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar, out double improvement) {


[12619]  311  Debug.Assert(startIdx < endIdx + 1); // at least 2 elements


[12332]  312 


 313  int rows = endIdx  startIdx + 1;


[12619]  314  Debug.Assert(rows >= 2);


[12332]  315 


 316  double sumY = 0.0;


 317  for (int i = startIdx; i <= endIdx; i++) {


 318  sumY += y[internalIdx[i]];


 319  }


 320 


[12623]  321  // see description of calculation in FindBestThreshold


 322  double bestImprovement = 1.0 / rows * sumY * sumY; // any improvement must be larger than this baseline


[12332]  323  double bestThreshold = double.PositiveInfinity;


[12619]  324  bestVar = RegressionTreeModel.TreeNode.NO_VARIABLE;


[12332]  325 


 326  for (int col = 0; col < effectiveVars; col++) {


 327  // sort values for variable to prepare for threshold selection


 328  var curVariable = allowedVariables[col];


 329  var curVariableIdx = varName2Index[curVariable];


 330  for (int i = startIdx; i <= endIdx; i++) {


 331  var sortedI = sortedIdx[curVariableIdx][i];


 332  outSortedIdx[i  startIdx] = sortedI;


 333  outx[i  startIdx] = x[curVariableIdx][sortedI];


 334  }


 335 


 336  double curImprovement;


 337  double curThreshold;


 338  FindBestThreshold(outx, outSortedIdx, rows, y, sumY, out curThreshold, out curImprovement);


 339 


 340  if (curImprovement > bestImprovement) {


 341  bestImprovement = curImprovement;


 342  bestThreshold = curThreshold;


 343  bestVar = allowedVariables[col];


 344  }


 345  }


[12619]  346  if (bestVar == RegressionTreeModel.TreeNode.NO_VARIABLE) {


[12632]  347  // not successfull


 348  threshold = double.PositiveInfinity;


 349  improvement = double.NegativeInfinity;


[12619]  350  return false;


 351  } else {


 352  UpdateVariableRelevance(bestVar, sumY, bestImprovement, rows);


[12632]  353  improvement = bestImprovement;


[12619]  354  threshold = bestThreshold;


 355  return true;


 356  }


[12332]  357  }


 358 


 359  // x [0..N1] contains rows sorted values in the range from [0..rows1]


 360  // sortedIdx [0..N1] contains the idx of the values in x in the original dataset in the range from [0..rows1]


 361  // rows specifies the number of valid entries in x and sortedIdx


 362  // y [0..N1] contains the target values in original sorting order


 363  // sumY is y.Sum()


 364  //


 365  // the routine returns the best threshold (x[i] + x[i+1]) / 2 for i = [0 .. rows2] by calculating the reduction in squared error


 366  // additionally the reduction in squared error is returned in bestImprovement


 367  // if all elements of x are equal the routing fails to produce a threshold


 368  private static void FindBestThreshold(double[] x, int[] sortedIdx, int rows, double[] y, double sumY, out double bestThreshold, out double bestImprovement) {


[12619]  369  Debug.Assert(rows >= 2);


[12332]  370 


 371  double sl = 0.0;


 372  double sr = sumY;


 373  double nl = 0.0;


 374  double nr = rows;


 375 


[12623]  376  bestImprovement = 1.0 / rows * sumY * sumY; // this is the baseline for the improvement


[12332]  377  bestThreshold = double.NegativeInfinity;


 378  // for all thresholds


 379  // if we have n rows there are n1 possible splits


 380  for (int i = 0; i < rows  1; i++) {


 381  sl += y[sortedIdx[i]];


 382  sr = y[sortedIdx[i]];


 383 


 384  nl++;


 385  nr;


 386  Debug.Assert(nl > 0);


 387  Debug.Assert(nr > 0);


 388 


 389  if (x[i] < x[i + 1]) { // don't try to split when two elements are equal


[12620]  390 


 391  // goal is to find the split with leading to minimal total variance of left and right parts


 392  // without partitioning the variance is var(y) = E(y²)  E(y)²


 393  // = 1/n * sum(y²)  (1/n * sum(y))²


[12623]  394  //  


 395  // constant baseline for improvement


 396  //


[12620]  397  // if we split into right and left part the overall variance is the weigthed combination nl/n * var(y_l) + nr/n * var(y_r)


 398  // = nl/n * (1/nl * sum(y_l²)  (1/nl * sum(y_l))²) + nr/n * (1/nr * sum(y_r²)  (1/nr * sum(y_r))²)


 399  // = 1/n * sum(y_l²)  1/nl * 1/n * sum(y_l)² + 1/n * sum(y_r²)  1/nr * 1/n * sum(y_r)²


 400  // = 1/n * (sum(y_l²) + sum(y_r²))  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


 401  // = 1/n * sum(y²)  1/n * (sum(y_l)² / nl + sum(y_r)² / nr)


 402  // 


 403  // not changed by split (and the same for total variance without partitioning)


 404  //


 405  // therefore we need to find the maximum value (sum(y_l)² / nl + sum(y_r)² / nr) (ignoring the factor 1/n)


 406  // and this value must be larger than 1/n * sum(y)² to be an improvement over no split


 407 


[12332]  408  double curQuality = sl * sl / nl + sr * sr / nr;


 409 


 410  if (curQuality > bestImprovement) {


 411  bestThreshold = (x[i] + x[i + 1]) / 2.0;


 412  bestImprovement = curQuality;


 413  }


 414  }


 415  }


 416 


 417  // if all elements where the same then no split can be found


 418  }


 419 


 420 


[12632]  421  private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {


 422  if (string.IsNullOrEmpty(bestVar)) return;


 423  // update variable relevance


 424  double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect


 425 


 426  double delta = (bestImprovement  baseLine);


 427  double v;


 428  if (!sumImprovements.TryGetValue(bestVar, out v)) {


 429  sumImprovements[bestVar] = delta;


 430  }


 431  sumImprovements[bestVar] = v + delta;


 432  }


 433 


[12332]  434  public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {


[12590]  435  // values are scaled: the most important variable has relevance = 100


[12332]  436  double scaling = 100 / sumImprovements.Max(t => t.Value);


 437  return


 438  sumImprovements


 439  .Select(t => new KeyValuePair<string, double>(t.Key, t.Value * scaling))


 440  .OrderByDescending(t => t.Value);


 441  }


[12632]  442 


 443 


 444  // insert a new parition split (find insertion point and start at first element of the queue)


 445  // elements are removed from the queue at the last position


 446  // O(n), splits could be organized as a heap to improve runtime (see alglib tsort)


 447  private void InsertSortedQueue(PartitionSplits split) {


 448  // find insertion position


 449  int i = 0;


 450  while (i < queue.Count && queue[i].SplittingImprovement < split.SplittingImprovement) { i++; }


 451 


 452  queue.Insert(i, split);


 453  }


[12332]  454  }


 455  }


 456 


 457 


 458 


 459 


 460 


 461 


 462 


 463 


 464 


 465 


 466 


 467 


 468 


 469 


 470 


 471 


 472 


 473 


 474 

