Changeset 12623


Ignore:
Timestamp:
07/07/15 08:48:06 (4 years ago)
Author:
gkronber
Message:

#2261: comments

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeBuilder.cs

    r12620 r12623  
    240 240    }
    241 241
    242     // routine for building the tree for the row idx stored in internalIdx between startIdx and endIdx
     242    // routine for building the tree for the partition of rows stored in internalIdx between startIdx and endIdx
    243 243    // the lineSearch function calculates the optimal prediction value for tree leaf nodes
    244 244    // (in the case of squared errors it is the average of target values for the rows represented by the node)
     
    248 248      // split - two pass
    249 249
    250       // store which index goes where
     250      // store which index goes into which partition
    251 251      for (int k = startIdx; k <= endIdx; k++) {
    252 252        if (x[bestVarIdx][internalIdx[k]] <= threshold)
     
    319 319      }
    320 320
    321       double bestImprovement = 1.0 / rows * sumY * sumY;
     321      // see description of calculation in FindBestThreshold
     322      double bestImprovement = 1.0 / rows * sumY * sumY; // any improvement must be larger than this baseline
    322 323      double bestThreshold = double.PositiveInfinity;
    323 324      bestVar = RegressionTreeModel.TreeNode.NO_VARIABLE;
     
    353 354    }
    354 355
    355     // TODO: assumption is that the Average(y) = 0
    356 356    private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {
    357 357      if (string.IsNullOrEmpty(bestVar)) return;
    358 358      // update variable relevance
    359       double err = sumY * sumY / rows;
    360       double errAfterSplit = bestImprovement;
    361 
    362       double delta = (errAfterSplit - err); // relative reduction in squared error
     359      double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect
     360
     361      double delta = (bestImprovement - baseLine);
    363 362      double v;
    364 363      if (!sumImprovements.TryGetValue(bestVar, out v)) {
     
    385 384      double nr = rows;
    386 385
    387       bestImprovement = 1.0 / rows * sumY * sumY;
     386      bestImprovement = 1.0 / rows * sumY * sumY; // this is the baseline for the improvement
    388 387      bestThreshold = double.NegativeInfinity;
    389 388      // for all thresholds
 
    403 402          // without partitioning the variance is var(y) = E(y²) - E(y)² 
    404 403          //    = 1/n * sum(y²) - (1/n * sum(y))²
    405           //      -------------
     404          //      -------------   ---------------
     405          //         constant       baseline for improvement
     406          //
    406 407          // if we split into right and left part the overall variance is the weighted combination nl/n * var(y_l) + nr/n * var(y_r) 
    407 408          //    = nl/n * (1/nl * sum(y_l²) - (1/nl * sum(y_l))²) + nr/n * (1/nr * sum(y_r²) - (1/nr * sum(y_r))²)
Note: See TracChangeset for help on using the changeset viewer.