Changeset 12632
- Timestamp: 07/07/15 11:57:37 (9 years ago)
- Location: branches/GBT-trunkintegration
- Files: 4 added, 6 edited, 1 moved
Legend:
- Unmodified lines have no prefix
- Added lines are prefixed with "+"
- Removed lines are prefixed with "-"
branches/GBT-trunkintegration/HeuristicLab 3.3.sln (r12587 → r12632)

    EndProject
    Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.Algorithms.DataAnalysis-3.4", "HeuristicLab.Algorithms.DataAnalysis\3.4\HeuristicLab.Algorithms.DataAnalysis-3.4.csproj", "{2E782078-FA81-4B70-B56F-74CE38DAC6C8}"
+   EndProject
+   Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tests", "Tests\Tests.csproj", "{ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}"
    EndProject
    Global
…
    {2E782078-FA81-4B70-B56F-74CE38DAC6C8}.Release|x86.ActiveCfg = Release|x86
    {2E782078-FA81-4B70-B56F-74CE38DAC6C8}.Release|x86.Build.0 = Release|x86
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|Any CPU.Build.0 = Debug|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|x64.ActiveCfg = Debug|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|x86.ActiveCfg = Debug|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|Any CPU.ActiveCfg = Release|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|Any CPU.Build.0 = Release|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|x64.ActiveCfg = Release|Any CPU
+   {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|x86.ActiveCfg = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithm.cs (r12620 → r12632)

    #region ParameterNames
    private const string IterationsParameterName = "Iterations";
-   private const string MaxDepthParameterName = "Maximum Tree Depth";
+   private const string MaxSizeParameterName = "Maximum Tree Size";
    private const string NuParameterName = "Nu";
    private const string RParameterName = "R";
…
      get { return (IFixedValueParameter<IntValue>)Parameters[IterationsParameterName]; }
    }
-   public IFixedValueParameter<IntValue> MaxDepthParameter {
-     get { return (IFixedValueParameter<IntValue>)Parameters[MaxDepthParameterName]; }
+   public IFixedValueParameter<IntValue> MaxSizeParameter {
+     get { return (IFixedValueParameter<IntValue>)Parameters[MaxSizeParameterName]; }
    }
    public IFixedValueParameter<DoubleValue> NuParameter {
…
      set { SetSeedRandomlyParameter.Value.Value = value; }
    }
-   public int MaxDepth {
-     get { return MaxDepthParameter.Value.Value; }
-     set { MaxDepthParameter.Value.Value = value; }
+   public int MaxSize {
+     get { return MaxSizeParameter.Value.Value; }
+     set { MaxSizeParameter.Value.Value = value; }
    }
    public double Nu {
…
    Parameters.Add(new FixedValueParameter<IntValue>(SeedParameterName, "The random seed used to initialize the new pseudo random number generator.", new IntValue(0)));
    Parameters.Add(new FixedValueParameter<BoolValue>(SetSeedRandomlyParameterName, "True if the random seed should be set to a random value, otherwise false.", new BoolValue(true)));
-   Parameters.Add(new FixedValueParameter<IntValue>(MaxDepthParameterName, "Maximal depth of the tree learned in each step (prefer smaller depths if possible)", new IntValue(5)));
+   Parameters.Add(new FixedValueParameter<IntValue>(MaxSizeParameterName, "Maximal size of the tree learned in each step (prefer smaller sizes if possible)", new IntValue(10)));
    Parameters.Add(new FixedValueParameter<DoubleValue>(RParameterName, "Ratio of training rows selected randomly in each step (0 < R <= 1)", new DoubleValue(0.5)));
    Parameters.Add(new FixedValueParameter<DoubleValue>(MParameterName, "Ratio of variables selected randomly in each step (0 < M <= 1)", new DoubleValue(0.5)));
…
    var lossFunction = ApplicationManager.Manager.GetInstances<ILossFunction>()
      .Single(l => l.ToString() == LossFunctionParameter.Value.Value);
-   var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxDepth, R, M, Nu);
+   var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxSize, R, M, Nu);

    var updateInterval = UpdateIntervalParameter.Value.Value;
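The rename is a semantic change, not just cosmetic: tree growth is now bounded by the total number of nodes instead of by depth. A full binary tree of depth d has 2^d - 1 nodes (the relation the removed comment in RegressionTreeBuilder.cs used to size the node array), so the old default MaxDepth = 5 permitted up to 31 nodes, while the new default MaxSize = 10 yields smaller and possibly unbalanced trees. A hypothetical migration helper for old experiment settings might look like this (illustrative only, not part of the changeset):

    // Convert a legacy MaxDepth setting into the loosest equivalent bound
    // for the new MaxSize parameter: depth d allows at most 2^d - 1 nodes.
    static int MaxSizeFromMaxDepth(int maxDepth) {
      return (1 << maxDepth) - 1;  // e.g. maxDepth = 5 -> MaxSize = 31
    }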
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithmStatic.cs (r12607 → r12632)

    internal MersenneTwister random { get; private set; }
    internal ILossFunction lossFunction { get; set; }
-   internal int maxDepth { get; set; }
+   internal int maxSize { get; set; }
    internal double nu { get; set; }
    internal double r { get; set; }
…
    internal IList<double> weights;

-   public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth, double r, double m, double nu) {
-     // default settings for MaxDepth, Nu and R
-     this.maxDepth = maxDepth;
+   public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize, double r, double m, double nu) {
+     // default settings for MaxSize, Nu and R
+     this.maxSize = maxSize;
      this.nu = nu;
      this.r = r;
…
    public double GetTestLoss() {
      var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices);
-     var wTest = yTest.Select(_ => 1.0); // ones
-     var nRows = yTest.Count();
+     var wTest = problemData.TestIndices.Select(_ => 1.0); // ones
+     var nRows = problemData.TestIndices.Count();
      return lossFunction.GetLoss(yTest, predTest, wTest) / nRows;
    }
…

    // simple interface
-   public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, int maxDepth, double nu, double r, int maxIterations) {
-     return TrainGbm(problemData, new SquaredErrorLoss(), maxDepth, nu, r, maxIterations);
+   public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, int maxSize, double nu, double r, int maxIterations) {
+     return TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu, r, maxIterations);
    }

    // simple interface
    public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction,
-     int maxDepth, double nu, double r, int maxIterations, uint randSeed = 31415) {
+     int maxSize, double nu, double r, int maxIterations, uint randSeed = 31415) {
      Contract.Assert(r > 0);
      Contract.Assert(r <= 1.0);
…

      var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed);
-     state.maxDepth = maxDepth;
+     state.maxSize = maxSize;
      state.r = r;
      state.nu = nu;
…

    // for custom stepping & termination
-   public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
-     return new GbmState(problemData, lossFunction, randSeed, maxDepth, r, m, nu);
+   public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
+     return new GbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
    }
…
      if (gbmState == null) throw new ArgumentException("state");

-     MakeStep(gbmState, gbmState.maxDepth, gbmState.nu, gbmState.r, gbmState.m);
+     MakeStep(gbmState, gbmState.maxSize, gbmState.nu, gbmState.r, gbmState.m);
    }

    // allow dynamic adaptation of maxDepth, nu and r (even though this is not used)
-   public static void MakeStep(IGbmState state, int maxDepth, double nu, double r, double m) {
+   public static void MakeStep(IGbmState state, int maxSize, double nu, double r, double m) {
      var gbmState = state as GbmState;
      if (gbmState == null) throw new ArgumentException("state");
…
      }

-     var tree = treeBuilder.CreateRegressionTreeForGradientBoosting(pseudoRes, maxDepth, activeIdx, lossFunction.GetLineSearchFunc(y, yPred, w), r, m);
+     var tree = treeBuilder.CreateRegressionTreeForGradientBoosting(pseudoRes, maxSize, activeIdx, lossFunction.GetLineSearchFunc(y, yPred, w), r, m);

      int i = 0;
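Besides the simple TrainGbm entry points, this class keeps CreateGbmState and MakeStep "for custom stepping & termination", as its own comment says. A minimal sketch of such a loop, written as if it ran inside the static class (like TrainGbm, so the GbmState cast is accessible); the early-stopping rule and all parameter values are illustrative assumptions, not part of the changeset:

    // Custom stepping loop with early stopping on the test loss.
    // CreateGbmState, MakeStep and GetTestLoss all appear in this changeset.
    var state = (GbmState)CreateGbmState(problemData, new SquaredErrorLoss(),
      randSeed: 31415, maxSize: 10, r: 0.5, m: 0.5, nu: 0.02);

    var bestLoss = double.PositiveInfinity;
    for (int i = 0; i < 1000; i++) {
      MakeStep(state);
      var testLoss = state.GetTestLoss();
      if (testLoss > bestLoss) break; // test loss deteriorates -> stop early
      bestLoss = testLoss;
    }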
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeBuilder.cs (r12623 → r12632)

    private readonly int[][] sortedIdx; // random selection from sortedIdxAll (for r < 1.0)

-   private int calls = 0;
-
    // helper arrays which are allocated to maximal necessary size only once in the ctor
    private readonly int[] internalIdx, which, leftTmp, rightTmp;
…
    private int curTreeNodeIdx; // the index where the next tree node is stored

-   private class Partition {
-     public int ParentNodeIdx { get; set; }
-     public int Depth { get; set; }
-     public int StartIdx { get; set; }
-     public int EndIndex { get; set; }
-     public bool Left { get; set; }
-   }
-   private readonly SortedList<double, Partition> queue;
+   // This class represents information about potential splits.
+   // For each node generated, the best splitting variable and threshold as well as
+   // the improvement from the split are stored in a priority queue
+   private class PartitionSplits {
+     public int ParentNodeIdx { get; set; } // the idx of the leaf node representing this partition
+     public int StartIdx { get; set; } // the start idx of the partition
+     public int EndIndex { get; set; } // the end idx of the partition
+     public string SplittingVariable { get; set; } // the best splitting variable
+     public double SplittingThreshold { get; set; } // the best threshold
+     public double SplittingImprovement { get; set; } // the improvement of the split (for priority queue)
+   }
+
+   // this list holds partitions with the information about the best split (organized as a sorted queue)
+   private readonly IList<PartitionSplits> queue;

    // prepare and allocate buffer variables in ctor
…
    outx = new double[rows];
    outSortedIdx = new int[rows];
-   queue = new SortedList<double, Partition>();
+   queue = new List<PartitionSplits>(100);

    x = new double[nCols][];
…
    // r is fraction of rows to use for training
    // m is fraction of variables to use for training
-   public IRegressionModel CreateRegressionTree(int maxDepth, double r = 0.5, double m = 0.5) {
+   public IRegressionModel CreateRegressionTree(int maxSize, double r = 0.5, double m = 0.5) {
      // subtract mean of y first
      var yAvg = y.Average();
…
      var ones = Enumerable.Repeat(1.0, y.Length);

-     var model = CreateRegressionTreeForGradientBoosting(y, maxDepth, problemData.TrainingIndices.ToArray(), seLoss.GetLineSearchFunc(y, zeros, ones), r, m);
+     var model = CreateRegressionTreeForGradientBoosting(y, maxSize, problemData.TrainingIndices.ToArray(), seLoss.GetLineSearchFunc(y, zeros, ones), r, m);

      return new GradientBoostedTreesModel(new[] { new ConstantRegressionModel(yAvg), model }, new[] { 1.0, 1.0 });
…
    }

    // specific interface that allows to specify the target labels and the training rows, which is necessary for gradient boosted trees
-   public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, int maxDepth, int[] idx, LineSearchFunc lineSearch, double r = 0.5, double m = 0.5) {
-     Debug.Assert(maxDepth > 0);
+   public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, int maxSize, int[] idx, LineSearchFunc lineSearch, double r = 0.5, double m = 0.5) {
+     Debug.Assert(maxSize > 0);
      Debug.Assert(r > 0);
      Debug.Assert(r <= 1.0);
…
      }

-     // prepare array for the tree nodes (a tree of maxDepth=1 has 1 node, a tree of maxDepth=d has 2^d - 1 nodes)
-     int numNodes = (int)Math.Pow(2, maxDepth) - 1;
-     this.tree = new RegressionTreeModel.TreeNode[numNodes];
+     this.tree = new RegressionTreeModel.TreeNode[maxSize];
+     this.queue.Clear();
      this.curTreeNodeIdx = 0;

+     // start out with only one leaf node (constant prediction)
+     // and calculate the best split for this root node and enqueue it into a queue sorted by improvement through the split
      // start and end idx are inclusive
-     queue.Add(calls++, new Partition() { ParentNodeIdx = -1, Depth = maxDepth, StartIdx = 0, EndIndex = effectiveRows - 1 });
-     CreateRegressionTreeForIdx(lineSearch);
-
-     return new RegressionTreeModel(tree);
-   }
-
-   private void CreateRegressionTreeForIdx(LineSearchFunc lineSearch) {
-     while (queue.Any()) {
-       var f = queue.First().Value; // actually a stack
-       queue.RemoveAt(0);
-
-       var depth = f.Depth;
+     CreateLeafNode(0, effectiveRows - 1, lineSearch);
+
+     // process the priority queue to complete the tree
+     CreateRegressionTreeFromQueue(maxSize, lineSearch);
+
+     return new RegressionTreeModel(tree.ToArray());
+   }
+
+
+   // processes potential splits from the queue as long as splits are left and the maximum size of the tree is not reached
+   private void CreateRegressionTreeFromQueue(int maxNodes, LineSearchFunc lineSearch) {
+     while (queue.Any() && curTreeNodeIdx + 1 < maxNodes) { // two nodes are created in each loop
+       var f = queue[queue.Count - 1]; // last element has the largest improvement
+       queue.RemoveAt(queue.Count - 1);
+
        var startIdx = f.StartIdx;
        var endIdx = f.EndIndex;
…
        Debug.Assert(endIdx < internalIdx.Length);

-       double threshold;
-       string bestVariableName;
-
-       // stop when only one row is left or no split is possible
-       if (depth <= 1 || endIdx - startIdx == 0 || !FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName)) {
-         CreateLeafNode(startIdx, endIdx, lineSearch);
-         if (f.ParentNodeIdx >= 0) if (f.Left) {
-           tree[f.ParentNodeIdx].leftIdx = curTreeNodeIdx;
-         } else {
-           tree[f.ParentNodeIdx].rightIdx = curTreeNodeIdx;
-         }
-         curTreeNodeIdx++;
-       } else {
-         int splitIdx;
-         CreateInternalNode(f.StartIdx, f.EndIndex, bestVariableName, threshold, out splitIdx);
-
-         // connect to parent tree
-         if (f.ParentNodeIdx >= 0) if (f.Left) {
-           tree[f.ParentNodeIdx].leftIdx = curTreeNodeIdx;
-         } else {
-           tree[f.ParentNodeIdx].rightIdx = curTreeNodeIdx;
-         }
-
-         Debug.Assert(splitIdx + 1 <= endIdx);
-         Debug.Assert(startIdx <= splitIdx);
-
-         queue.Add(calls++, new Partition() { ParentNodeIdx = curTreeNodeIdx, Left = true, Depth = depth - 1, StartIdx = startIdx, EndIndex = splitIdx }); // left part before right part (stack organization)
-         queue.Add(calls++, new Partition() { ParentNodeIdx = curTreeNodeIdx, Left = false, Depth = depth - 1, StartIdx = splitIdx + 1, EndIndex = endIdx });
-         curTreeNodeIdx++;
-       }
-     }
-   }
-
-   private void CreateLeafNode(int startIdx, int endIdx, LineSearchFunc lineSearch) {
-     // max depth reached or only one element
-     tree[curTreeNodeIdx].varName = RegressionTreeModel.TreeNode.NO_VARIABLE;
-     tree[curTreeNodeIdx].val = lineSearch(internalIdx, startIdx, endIdx);
-   }
-
-   // routine for building the tree for the partition of rows stored in internalIdx between startIdx and endIdx
-   // the lineSearch function calculates the optimal prediction value for tree leaf nodes
-   // (in the case of squared errors it is the average of target values for the rows represented by the node)
+       // transform the leaf node into an internal node
+       tree[f.ParentNodeIdx].VarName = f.SplittingVariable;
+       tree[f.ParentNodeIdx].Val = f.SplittingThreshold;
+
+       // split partition into left and right
+       int splitIdx;
+       SplitPartition(f.StartIdx, f.EndIndex, f.SplittingVariable, f.SplittingThreshold, out splitIdx);
+       Debug.Assert(splitIdx + 1 <= endIdx);
+       Debug.Assert(startIdx <= splitIdx);
+
+       // create two leaf nodes (and enqueue best splits for both)
+       tree[f.ParentNodeIdx].LeftIdx = CreateLeafNode(startIdx, splitIdx, lineSearch);
+       tree[f.ParentNodeIdx].RightIdx = CreateLeafNode(splitIdx + 1, endIdx, lineSearch);
+     }
+   }
+
+
+   // returns the index of the newly created tree node
+   private int CreateLeafNode(int startIdx, int endIdx, LineSearchFunc lineSearch) {
+     tree[curTreeNodeIdx].VarName = RegressionTreeModel.TreeNode.NO_VARIABLE;
+     tree[curTreeNodeIdx].Val = lineSearch(internalIdx, startIdx, endIdx);
+
+     EnqueuePartitionSplit(curTreeNodeIdx, startIdx, endIdx);
+     curTreeNodeIdx++;
+     return curTreeNodeIdx - 1;
+   }
+
+
+   // calculates the optimal split for the partition [startIdx .. endIdx] (inclusive)
+   // which is represented by the leaf node with the specified nodeIdx
+   private void EnqueuePartitionSplit(int nodeIdx, int startIdx, int endIdx) {
+     double threshold, improvement;
+     string bestVariableName;
+     // only enqueue a new split if there are at least 2 rows left and a split is possible
+     if (startIdx < endIdx &&
+       FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName, out improvement)) {
+       var split = new PartitionSplits() {
+         ParentNodeIdx = nodeIdx,
+         StartIdx = startIdx,
+         EndIndex = endIdx,
+         SplittingThreshold = threshold,
+         SplittingVariable = bestVariableName,
+         SplittingImprovement = improvement
+       };
+       InsertSortedQueue(split);
+     }
+   }
+
+
+   // routine for splitting a partition of rows stored in internalIdx between startIdx and endIdx into
+   // a left partition and a right partition using the given splittingVariable and threshold
+   // the splitIdx is the last index of the left partition
+   // splitIdx + 1 is the first index of the right partition
    // startIdx and endIdx are inclusive
-   private void CreateInternalNode(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {
+   private void SplitPartition(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {
      int bestVarIdx = varName2Index[splittingVar];
      // split - two pass
…
      Debug.Assert(startIdx <= j);

-     tree[curTreeNodeIdx].varName = splittingVar;
-     tree[curTreeNodeIdx].val = threshold;
      splitIdx = j;
    }

-   private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar) {
+   private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar, out double improvement) {
      Debug.Assert(startIdx < endIdx + 1); // at least 2 elements
…
      }
      if (bestVar == RegressionTreeModel.TreeNode.NO_VARIABLE) {
-       threshold = bestThreshold;
+       // not successful
+       threshold = double.PositiveInfinity;
+       improvement = double.NegativeInfinity;
        return false;
      } else {
        UpdateVariableRelevance(bestVar, sumY, bestImprovement, rows);
+       improvement = bestImprovement;
        threshold = bestThreshold;
        return true;
      }
    }
-
-   private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {
-     if (string.IsNullOrEmpty(bestVar)) return;
-     // update variable relevance
-     double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect
-
-     double delta = (bestImprovement - baseLine);
-     double v;
-     if (!sumImprovements.TryGetValue(bestVar, out v)) {
-       sumImprovements[bestVar] = delta;
-     }
-     sumImprovements[bestVar] = v + delta;
-   }
…

+   private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {
+     if (string.IsNullOrEmpty(bestVar)) return;
+     // update variable relevance
+     double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect
+
+     double delta = (bestImprovement - baseLine);
+     double v;
+     if (!sumImprovements.TryGetValue(bestVar, out v)) {
+       sumImprovements[bestVar] = delta;
+     }
+     sumImprovements[bestVar] = v + delta;
+   }
+
    public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {
      // values are scaled: the most important variable has relevance = 100
…
        .OrderByDescending(t => t.Value);
    }
+
+
+   // insert a new partition split (find insertion point and start at first element of the queue)
+   // elements are removed from the queue at the last position
+   // O(n), splits could be organized as a heap to improve runtime (see alglib tsort)
+   private void InsertSortedQueue(PartitionSplits split) {
+     // find insertion position
+     int i = 0;
+     while (i < queue.Count && queue[i].SplittingImprovement < split.SplittingImprovement) { i++; }
+
+     queue.Insert(i, split);
+   }
  }
}
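The net effect of these changes: instead of growing every branch to a fixed depth (depth-first, stack-ordered), the builder now grows the tree best-first. Each leaf's best potential split sits in a queue ordered by improvement, the most promising split is expanded next, and growth stops once the node budget (maxSize) is spent, so nodes go to the branches where splitting pays off most. Stripped of the HeuristicLab specifics, the scheme looks roughly like this self-contained sketch (all names are illustrative; one pre-sorted input variable and an SSE-reduction criterion stand in for FindBestVariableAndThreshold and the line search):

    using System;
    using System.Collections.Generic;
    using System.Linq;

    // Best-first tree growth in the style introduced by this changeset:
    // every leaf keeps its best candidate split in a list sorted ascending
    // by improvement; the tail (= largest improvement) is expanded until
    // the node budget is exhausted.
    class BestFirstTreeSketch {
      class Node { public double Val; public int CutIdx = -1, Left = -1, Right = -1; }
      class Cand { public int NodeIdx, Start, End, CutIdx; public double Improvement; }

      static readonly double[] y = { 1, 1, 1, 1, 5, 5, 5, 9 }; // target, rows pre-sorted by x
      static readonly List<Node> tree = new List<Node>();
      static readonly List<Cand> queue = new List<Cand>();

      static void Main() {
        const int maxSize = 5;
        CreateLeaf(0, y.Length - 1);                      // root: constant prediction
        while (queue.Any() && tree.Count + 1 < maxSize) { // each split adds two nodes
          var best = queue[queue.Count - 1];              // largest improvement is last
          queue.RemoveAt(queue.Count - 1);
          tree[best.NodeIdx].CutIdx = best.CutIdx;        // leaf becomes internal node
          tree[best.NodeIdx].Left = CreateLeaf(best.Start, best.CutIdx);
          tree[best.NodeIdx].Right = CreateLeaf(best.CutIdx + 1, best.End);
        }
        Console.WriteLine("tree has {0} nodes", tree.Count);
      }

      // create a leaf for rows [start..end] and enqueue its best split (if any)
      static int CreateLeaf(int start, int end) {
        tree.Add(new Node { Val = Avg(start, end) });
        double bestImp = double.NegativeInfinity; int bestCut = -1;
        for (int cut = start; cut < end; cut++) {         // needs at least 2 rows
          double imp = Sse(start, end) - Sse(start, cut) - Sse(cut + 1, end);
          if (imp > bestImp) { bestImp = imp; bestCut = cut; }
        }
        if (bestCut >= 0) InsertSorted(new Cand {
          NodeIdx = tree.Count - 1, Start = start, End = end,
          CutIdx = bestCut, Improvement = bestImp
        });
        return tree.Count - 1;
      }

      // O(n) sorted insertion, like InsertSortedQueue in the changeset
      static void InsertSorted(Cand c) {
        int i = 0;
        while (i < queue.Count && queue[i].Improvement < c.Improvement) i++;
        queue.Insert(i, c);
      }

      static double Avg(int s, int e) { return y.Skip(s).Take(e - s + 1).Average(); }
      static double Sse(int s, int e) {
        double a = Avg(s, e);
        return y.Skip(s).Take(e - s + 1).Sum(v => (v - a) * (v - a));
      }
    }

As in the changeset, popping from the tail of an ascending-sorted list yields the split with the largest improvement, and each expansion turns one leaf into an internal node plus two new leaves, which is why the loop guard checks the node count plus one against maxSize. The O(n) sorted insertion is acceptable because the queue holds at most one entry per leaf; the commit's own comment notes a heap would improve this.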
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeModel.cs (r12590 → r12632)

    public struct TreeNode {
      public readonly static string NO_VARIABLE = string.Empty;
-     public string varName; // name of the variable for splitting or -1 if terminal node
-     public double val; // threshold
-     public int leftIdx;
-     public int rightIdx;
+     public string VarName { get; set; } // name of the variable for splitting, or NO_VARIABLE if terminal node
+     public double Val { get; set; } // threshold
+     public int LeftIdx { get; set; }
+     public int RightIdx { get; set; }

      public override int GetHashCode() {
-       return leftIdx ^ rightIdx ^ val.GetHashCode();
+       return LeftIdx ^ RightIdx ^ Val.GetHashCode();
      }
    }
…
    private static double GetPredictionForRow(TreeNode[] t, int nodeIdx, IDataset ds, int row) {
      var node = t[nodeIdx];
-     if (node.varName == TreeNode.NO_VARIABLE)
-       return node.val;
-     else if (ds.GetDoubleValue(node.varName, row) <= node.val)
-       return GetPredictionForRow(t, node.leftIdx, ds, row);
+     if (node.VarName == TreeNode.NO_VARIABLE)
+       return node.Val;
+     else if (ds.GetDoubleValue(node.VarName, row) <= node.Val)
+       return GetPredictionForRow(t, node.LeftIdx, ds, row);
      else
-       return GetPredictionForRow(t, node.rightIdx, ds, row);
+       return GetPredictionForRow(t, node.RightIdx, ds, row);
    }
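A side note on the switch from public fields to auto-properties: TreeNode remains a mutable struct stored in a plain array, and the builder assigns through expressions such as tree[f.ParentNodeIdx].VarName = .... That only compiles because an array element is a variable; the same assignment through a List<TreeNode> indexer would operate on a copy and be rejected by the compiler. A two-line illustration (hypothetical usage, not from the changeset):

    var arr = new RegressionTreeModel.TreeNode[1];
    arr[0].Val = 3.14;      // OK: arr[0] is a variable, mutated in place

    // var list = new List<RegressionTreeModel.TreeNode> { default(RegressionTreeModel.TreeNode) };
    // list[0].Val = 3.14;  // error CS1612: list[0] returns a copy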
branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/HeuristicLab.Algorithms.DataAnalysis-3.4.csproj (r12620 → r12632)

      <Private>False</Private>
    </Reference>
-   <Reference Include="Microsoft.VisualStudio.QualityTools.UnitTestFramework, Version=10.1.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
    <Reference Include="System" />
    <Reference Include="System.Core">
…
    <Compile Include="GradientBoostedTrees\RegressionTreeBuilder.cs" />
    <Compile Include="GradientBoostedTrees\RegressionTreeModel.cs" />
-   <Compile Include="GradientBoostedTrees\Test.cs" />
    <Compile Include="Interfaces\IGaussianProcessClassificationModelCreator.cs" />
    <Compile Include="Interfaces\IGaussianProcessRegressionModelCreator.cs" />
branches/GBT-trunkintegration/Tests/Test.cs (moved; r12622 → r12632)

    using System.Linq;
    using System.Runtime.CompilerServices;
+   using System.Threading;
+   using HeuristicLab.Data;
+   using HeuristicLab.Optimization;
    using HeuristicLab.Problems.DataAnalysis;
    using HeuristicLab.Random;
…
      var allVariables = new string[] { "y", "x1", "x2" };

-     // x1 <= 15 -> 1
-     // x1 > 15 -> 2
+     // x1 <= 15 -> 2
+     // x1 > 15 -> 1
      BuildTree(xy, allVariables, 10);
    }
…

      // ignore irrelevant variables
-     // x1 <= 15 -> 1
-     // x1 > 15 -> 2
+     // x1 <= 15 -> 2
+     // x1 > 15 -> 1
      BuildTree(xy, allVariables, 10);
    }
…
      // x1 <= 15 AND x2 <= 0 -> 3
      // x1 <= 15 AND x2 > 0 -> 4
-     // x1 > 15 AND x2 <= 0 -> 1
-     // x1 > 15 AND x2 > 0 -> 2
+     // x1 > 15 AND x2 <= 0 -> 2
+     // x1 > 15 AND x2 > 0 -> 1
      BuildTree(xy, allVariables, 10);
    }
…
      // x1 <= 15 AND x2 <= 0 -> 3
      // x1 <= 15 AND x2 > 0 -> 4
-     // x1 > 15 AND x2 <= 0 -> 1
-     // x1 > 15 AND x2 > 0 -> 2
+     // x1 > 15 AND x2 <= 0 -> 2
+     // x1 > 15 AND x2 > 0 -> 1
      BuildTree(xy, allVariables, 10);
    }
…

      // split cannot be found
+     // -> 5.50
      BuildTree(xy, allVariables, 3);
    }
…

      var allVariables = new string[] { "y", "x1", "x2" };
+     // (two possible solutions)
+     // x2 <= 1.5 -> 5.50
+     // x2 > 1.5 -> 5.55
+     BuildTree(xy, allVariables, 3);
+
      // x1 <= 1.5 AND x2 <= 1.5 -> 10
      // x1 <= 1.5 AND x2 > 1.5 -> 1
      // x1 > 1.5 AND x2 <= 1.5 -> 1
      // x1 > 1.5 AND x2 > 1.5 -> 10.1
-     BuildTree(xy, allVariables, 3);
+     BuildTree(xy, allVariables, 7);
    }
    {
…
      // x1 > 1.5 AND x2 <= 1.5 -> 0.9
      // x1 > 1.5 AND x2 > 1.5 -> 1.1
-     BuildTree(xy, allVariables, 3);
-   }
- }
-
+     BuildTree(xy, allVariables, 10);
+   }
+
+   {
+     // unbalanced split
+     var xy = new double[,]
+     {
+       {-1, 1, 1},
+       {-1, 1, 2},
+       {-1, 2, 1},
+       { 1, 2, 2},
+     };
+
+     var allVariables = new string[] { "y", "x1", "x2" };
+     // (two possible solutions)
+     // x2 <= 1.5 -> -1.0
+     // x2 > 1.5 AND x1 <= 1.5 -> -1.0
+     // x2 > 1.5 AND x1 > 1.5 -> 1.0
+     BuildTree(xy, allVariables, 10);
+   }
+ }
+
+ [TestMethod]
+ [TestCategory("Algorithms.DataAnalysis")]
+ [TestProperty("Time", "long")]
+ public void GradientBoostingTestTowerSquaredError() {
+   var gbt = new GradientBoostedTreesAlgorithm();
+   var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
+   var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
+   var regProblem = new RegressionProblem();
+   regProblem.Load(provider.LoadData(instance));
+
+   #region Algorithm Configuration
+   gbt.Problem = regProblem;
+   gbt.Seed = 0;
+   gbt.SetSeedRandomly = false;
+   gbt.Iterations = 5000;
+   gbt.MaxSize = 20;
+   #endregion
+
+   RunAlgorithm(gbt);
+
+   Assert.AreEqual(267.68704241153921, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
+   Assert.AreEqual(393.84704062205469, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
+ }
+
+ [TestMethod]
+ [TestCategory("Algorithms.DataAnalysis")]
+ [TestProperty("Time", "long")]
+ public void GradientBoostingTestTowerAbsoluteError() {
+   var gbt = new GradientBoostedTreesAlgorithm();
+   var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
+   var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
+   var regProblem = new RegressionProblem();
+   regProblem.Load(provider.LoadData(instance));
+
+   #region Algorithm Configuration
+   gbt.Problem = regProblem;
+   gbt.Seed = 0;
+   gbt.SetSeedRandomly = false;
+   gbt.Iterations = 1000;
+   gbt.MaxSize = 20;
+   gbt.Nu = 0.02;
+   gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Absolute"));
+   #endregion
+
+   RunAlgorithm(gbt);
+
+   Assert.AreEqual(10.551385044666661, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
+   Assert.AreEqual(12.918001745581172, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
+ }
+
+ [TestMethod]
+ [TestCategory("Algorithms.DataAnalysis")]
+ [TestProperty("Time", "long")]
+ public void GradientBoostingTestTowerRelativeError() {
+   var gbt = new GradientBoostedTreesAlgorithm();
+   var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
+   var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
+   var regProblem = new RegressionProblem();
+   regProblem.Load(provider.LoadData(instance));
+
+   #region Algorithm Configuration
+   gbt.Problem = regProblem;
+   gbt.Seed = 0;
+   gbt.SetSeedRandomly = false;
+   gbt.Iterations = 3000;
+   gbt.MaxSize = 20;
+   gbt.Nu = 0.005;
+   gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Relative"));
+   #endregion
+
+   RunAlgorithm(gbt);
+
+   Assert.AreEqual(0.061954221604374943, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
+   Assert.AreEqual(0.06316303473499961, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
+ }
+
+ // same as in SamplesUtil
+ private void RunAlgorithm(IAlgorithm a) {
+   var trigger = new EventWaitHandle(false, EventResetMode.ManualReset);
+   Exception ex = null;
+   a.Stopped += (src, e) => { trigger.Set(); };
+   a.ExceptionOccurred += (src, e) => { ex = e.Value; trigger.Set(); };
+   a.Prepare();
+   a.Start();
+   trigger.WaitOne();
+
+   Assert.AreEqual(ex, null);
+ }
+
+ #region helper
  private void BuildTree(double[,] xy, string[] allVariables, int maxDepth) {
    int nRows = xy.GetLength(0);
…
  private void WriteTree(RegressionTreeModel.TreeNode[] tree, int idx, string partialRule, double offset) {
    var n = tree[idx];
-   if (n.varName == RegressionTreeModel.TreeNode.NO_VARIABLE) {
-     Console.WriteLine("{0} -> {1:F}", partialRule, n.val + offset);
+   if (n.VarName == RegressionTreeModel.TreeNode.NO_VARIABLE) {
+     Console.WriteLine("{0} -> {1:F}", partialRule, n.Val + offset);
    } else {
-     WriteTree(tree, n.leftIdx,
+     WriteTree(tree, n.LeftIdx,
        string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} <= {3:F}",
          partialRule,
          string.IsNullOrEmpty(partialRule) ? "" : " and ",
-         n.varName,
-         n.val), offset);
-     WriteTree(tree, n.rightIdx,
+         n.VarName,
+         n.Val), offset);
+     WriteTree(tree, n.RightIdx,
        string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} > {3:F}",
          partialRule,
          string.IsNullOrEmpty(partialRule) ? "" : " and ",
-         n.varName,
-         n.val), offset);
    }
  }
+ #endregion
}
}