Changeset 12623
- Timestamp:
- 07/07/15 08:48:06 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeBuilder.cs
r12620 r12623 240 240 } 241 241 242 // routine for building the tree for the row idx stored in internalIdx between startIdx and endIdx 242 // routine for building the tree for the partition of rows stored in internalIdx between startIdx and endIdx 243 243 // the lineSearch function calculates the optimal prediction value for tree leaf nodes 244 244 // (in the case of squared errors it is the average of target values for the rows represented by the node) … … 248 248 // split - two pass 249 249 250 // store which index goes where 250 // store which index goes into which partition 251 251 for (int k = startIdx; k <= endIdx; k++) { 252 252 if (x[bestVarIdx][internalIdx[k]] <= threshold) … … 319 319 } 320 320 321 double bestImprovement = 1.0 / rows * sumY * sumY; 321 // see description of calculation in FindBestThreshold 322 double bestImprovement = 1.0 / rows * sumY * sumY; // any improvement must be larger than this baseline 322 323 double bestThreshold = double.PositiveInfinity; 323 324 bestVar = RegressionTreeModel.TreeNode.NO_VARIABLE; … … 353 354 } 354 355 355 // TODO: assumption is that the Average(y) = 0 356 356 private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) { 357 357 if (string.IsNullOrEmpty(bestVar)) return; 358 358 // update variable relevance 359 double err = sumY * sumY / rows; 360 double errAfterSplit = bestImprovement; 361 362 double delta = (errAfterSplit - err); // relative reduction in squared error 359 double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect 360 361 double delta = (bestImprovement - baseLine); 363 362 double v; 364 363 if (!sumImprovements.TryGetValue(bestVar, out v)) { … … 385 384 double nr = rows; 386 385 387 bestImprovement = 1.0 / rows * sumY * sumY; 386 bestImprovement = 1.0 / rows * sumY * sumY; // this is the baseline for the improvement 388 387 bestThreshold = double.NegativeInfinity; 389 388 // for all thresholds … … 
403 402 // without partitioning the variance is var(y) = E(y²) - E(y)² 404 403 // = 1/n * sum(y²) - (1/n * sum(y))² 405 // ------------- 404 // ------------- --------------- 405 // constant baseline for improvement 406 // 406 407 // if we split into right and left part the overall variance is the weighted combination nl/n * var(y_l) + nr/n * var(y_r) 407 408 // = nl/n * (1/nl * sum(y_l²) - (1/nl * sum(y_l))²) + nr/n * (1/nr * sum(y_r²) - (1/nr * sum(y_r))²)
Note: See TracChangeset
for help on using the changeset viewer.