Free cookie consent management tool by TermsFeed Policy Generator

Changeset 12632


Ignore:
Timestamp:
07/07/15 11:57:37 (9 years ago)
Author:
gkronber
Message:

#2261 implemented node expansion using a priority queue (and changed parameter MaxDepth to MaxSize). Moved unit tests to a separate project.

Location:
branches/GBT-trunkintegration
Files:
4 added
6 edited
1 moved

Legend:

Unmodified
Added
Removed
  • branches/GBT-trunkintegration/HeuristicLab 3.3.sln

    r12587 r12632  
    1212EndProject
    1313Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.Algorithms.DataAnalysis-3.4", "HeuristicLab.Algorithms.DataAnalysis\3.4\HeuristicLab.Algorithms.DataAnalysis-3.4.csproj", "{2E782078-FA81-4B70-B56F-74CE38DAC6C8}"
     14EndProject
     15Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Tests", "Tests\Tests.csproj", "{ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}"
    1416EndProject
    1517Global
     
    3537    {2E782078-FA81-4B70-B56F-74CE38DAC6C8}.Release|x86.ActiveCfg = Release|x86
    3638    {2E782078-FA81-4B70-B56F-74CE38DAC6C8}.Release|x86.Build.0 = Release|x86
     39    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
     40    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|Any CPU.Build.0 = Debug|Any CPU
     41    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|x64.ActiveCfg = Debug|Any CPU
     42    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Debug|x86.ActiveCfg = Debug|Any CPU
     43    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|Any CPU.ActiveCfg = Release|Any CPU
     44    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|Any CPU.Build.0 = Release|Any CPU
     45    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|x64.ActiveCfg = Release|Any CPU
     46    {ABBC647A-B54B-4DF3-9F64-DBD11AD46A15}.Release|x86.ActiveCfg = Release|Any CPU
    3747  EndGlobalSection
    3848  GlobalSection(SolutionProperties) = preSolution
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithm.cs

    r12620 r12632  
    4949    #region ParameterNames
    5050    private const string IterationsParameterName = "Iterations";
    51     private const string MaxDepthParameterName = "Maximum Tree Depth";
     51    private const string MaxSizeParameterName = "Maximum Tree Size";
    5252    private const string NuParameterName = "Nu";
    5353    private const string RParameterName = "R";
     
    6464      get { return (IFixedValueParameter<IntValue>)Parameters[IterationsParameterName]; }
    6565    }
    66     public IFixedValueParameter<IntValue> MaxDepthParameter {
    67       get { return (IFixedValueParameter<IntValue>)Parameters[MaxDepthParameterName]; }
     66    public IFixedValueParameter<IntValue> MaxSizeParameter {
     67      get { return (IFixedValueParameter<IntValue>)Parameters[MaxSizeParameterName]; }
    6868    }
    6969    public IFixedValueParameter<DoubleValue> NuParameter {
     
    106106      set { SetSeedRandomlyParameter.Value.Value = value; }
    107107    }
    108     public int MaxDepth {
    109       get { return MaxDepthParameter.Value.Value; }
    110       set { MaxDepthParameter.Value.Value = value; }
     108    public int MaxSize {
     109      get { return MaxSizeParameter.Value.Value; }
     110      set { MaxSizeParameter.Value.Value = value; }
    111111    }
    112112    public double Nu {
     
    155155      Parameters.Add(new FixedValueParameter<IntValue>(SeedParameterName, "The random seed used to initialize the new pseudo random number generator.", new IntValue(0)));
    156156      Parameters.Add(new FixedValueParameter<BoolValue>(SetSeedRandomlyParameterName, "True if the random seed should be set to a random value, otherwise false.", new BoolValue(true)));
    157       Parameters.Add(new FixedValueParameter<IntValue>(MaxDepthParameterName, "Maximal depth of the tree learned in each step (prefer smaller depths if possible)", new IntValue(5)));
     157      Parameters.Add(new FixedValueParameter<IntValue>(MaxSizeParameterName, "Maximal size of the tree learned in each step (prefer smaller sizes if possible)", new IntValue(10)));
    158158      Parameters.Add(new FixedValueParameter<DoubleValue>(RParameterName, "Ratio of training rows selected randomly in each step (0 < R <= 1)", new DoubleValue(0.5)));
    159159      Parameters.Add(new FixedValueParameter<DoubleValue>(MParameterName, "Ratio of variables selected randomly in each step (0 < M <= 1)", new DoubleValue(0.5)));
     
    189189      var lossFunction = ApplicationManager.Manager.GetInstances<ILossFunction>()
    190190        .Single(l => l.ToString() == LossFunctionParameter.Value.Value);
    191       var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxDepth, R, M, Nu);
     191      var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxSize, R, M, Nu);
    192192
    193193      var updateInterval = UpdateIntervalParameter.Value.Value;
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithmStatic.cs

    r12607 r12632  
    4545      internal MersenneTwister random { get; private set; }
    4646      internal ILossFunction lossFunction { get; set; }
    47       internal int maxDepth { get; set; }
     47      internal int maxSize { get; set; }
    4848      internal double nu { get; set; }
    4949      internal double r { get; set; }
     
    6363      internal IList<double> weights;
    6464
    65       public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth, double r, double m, double nu) {
    66         // default settings for MaxDepth, Nu and R
    67         this.maxDepth = maxDepth;
     65      public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize, double r, double m, double nu) {
     66        // default settings for MaxSize, Nu and R
     67        this.maxSize = maxSize;
    6868        this.nu = nu;
    6969        this.r = r;
     
    111111      public double GetTestLoss() {
    112112        var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices);
    113         var wTest = yTest.Select(_ => 1.0); // ones
    114         var nRows = yTest.Count();
     113        var wTest = problemData.TestIndices.Select(_ => 1.0); // ones
     114        var nRows = problemData.TestIndices.Count();
    115115        return lossFunction.GetLoss(yTest, predTest, wTest) / nRows;
    116116      }
     
    118118
    119119    // simple interface
    120     public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, int maxDepth, double nu, double r, int maxIterations) {
    121       return TrainGbm(problemData, new SquaredErrorLoss(), maxDepth, nu, r, maxIterations);
     120    public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, int maxSize, double nu, double r, int maxIterations) {
     121      return TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu, r, maxIterations);
    122122    }
    123123
    124124    // simple interface
    125125    public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction,
    126       int maxDepth, double nu, double r, int maxIterations, uint randSeed = 31415) {
     126      int maxSize, double nu, double r, int maxIterations, uint randSeed = 31415) {
    127127      Contract.Assert(r > 0);
    128128      Contract.Assert(r <= 1.0);
     
    131131
    132132      var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed);
    133       state.maxDepth = maxDepth;
     133      state.maxSize = maxSize;
    134134      state.r = r;
    135135      state.nu = nu;
     
    144144
    145145    // for custom stepping & termination
    146     public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxDepth = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
    147       return new GbmState(problemData, lossFunction, randSeed, maxDepth, r, m, nu);
     146    public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
     147      return new GbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
    148148    }
    149149
     
    153153      if (gbmState == null) throw new ArgumentException("state");
    154154
    155       MakeStep(gbmState, gbmState.maxDepth, gbmState.nu, gbmState.r, gbmState.m);
     155      MakeStep(gbmState, gbmState.maxSize, gbmState.nu, gbmState.r, gbmState.m);
    156156    }
    157157
    158158    // allow dynamic adaptation of maxDepth, nu and r (even though this is not used)
    159     public static void MakeStep(IGbmState state, int maxDepth, double nu, double r, double m) {
     159    public static void MakeStep(IGbmState state, int maxSize, double nu, double r, double m) {
    160160      var gbmState = state as GbmState;
    161161      if (gbmState == null) throw new ArgumentException("state");
     
    177177      }
    178178
    179       var tree = treeBuilder.CreateRegressionTreeForGradientBoosting(pseudoRes, maxDepth, activeIdx, lossFunction.GetLineSearchFunc(y, yPred, w), r, m);
     179      var tree = treeBuilder.CreateRegressionTreeForGradientBoosting(pseudoRes, maxSize, activeIdx, lossFunction.GetLineSearchFunc(y, yPred, w), r, m);
    180180
    181181      int i = 0;
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeBuilder.cs

    r12623 r12632  
    5454    private readonly int[][] sortedIdx; // random selection from sortedIdxAll (for r < 1.0)
    5555
    56     private int calls = 0;
    57 
    5856    // helper arrays which are allocated to maximal necessary size only once in the ctor
    5957    private readonly int[] internalIdx, which, leftTmp, rightTmp;
     
    6462    private int curTreeNodeIdx; // the index where the next tree node is stored
    6563
    66     private class Partition {
    67       public int ParentNodeIdx { get; set; }
    68       public int Depth { get; set; }
    69       public int StartIdx { get; set; }
    70       public int EndIndex { get; set; }
    71       public bool Left { get; set; }
    72     }
    73     private readonly SortedList<double, Partition> queue;
     64    // This class represents information about potential splits.
     65    // For each node generated the best splitting variable and threshold as well as
     66    // the improvement from the split are stored in a priority queue
     67    private class PartitionSplits {
     68      public int ParentNodeIdx { get; set; } // the idx of the leaf node representing this partition
     69      public int StartIdx { get; set; } // the start idx of the partition
     70      public int EndIndex { get; set; } // the end idx of the partition
     71      public string SplittingVariable { get; set; } // the best splitting variable
     72      public double SplittingThreshold { get; set; } // the best threshold
     73      public double SplittingImprovement { get; set; } // the improvement of the split (for priority queue)
     74    }
     75
     76    // this list hold partitions with the information about the best split (organized as a sorted queue)
     77    private readonly IList<PartitionSplits> queue;
    7478
    7579    // prepare and allocate buffer variables in ctor
     
    9599      outx = new double[rows];
    96100      outSortedIdx = new int[rows];
    97       queue = new SortedList<double, Partition>();
     101      queue = new List<PartitionSplits>(100);
    98102
    99103      x = new double[nCols][];
     
    117121    // r is fraction of rows to use for training
    118122    // m is fraction of variables to use for training
    119     public IRegressionModel CreateRegressionTree(int maxDepth, double r = 0.5, double m = 0.5) {
     123    public IRegressionModel CreateRegressionTree(int maxSize, double r = 0.5, double m = 0.5) {
    120124      // subtract mean of y first
    121125      var yAvg = y.Average();
     
    126130      var ones = Enumerable.Repeat(1.0, y.Length);
    127131
    128       var model = CreateRegressionTreeForGradientBoosting(y, maxDepth, problemData.TrainingIndices.ToArray(), seLoss.GetLineSearchFunc(y, zeros, ones), r, m);
     132      var model = CreateRegressionTreeForGradientBoosting(y, maxSize, problemData.TrainingIndices.ToArray(), seLoss.GetLineSearchFunc(y, zeros, ones), r, m);
    129133
    130134      return new GradientBoostedTreesModel(new[] { new ConstantRegressionModel(yAvg), model }, new[] { 1.0, 1.0 });
     
    132136
    133137    // specific interface that allows to specify the target labels and the training rows which is necessary when for gradient boosted trees
    134     public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, int maxDepth, int[] idx, LineSearchFunc lineSearch, double r = 0.5, double m = 0.5) {
    135       Debug.Assert(maxDepth > 0);
     138    public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, int maxSize, int[] idx, LineSearchFunc lineSearch, double r = 0.5, double m = 0.5) {
     139      Debug.Assert(maxSize > 0);
    136140      Debug.Assert(r > 0);
    137141      Debug.Assert(r <= 1.0);
     
    174178      }
    175179
    176       // prepare array for the tree nodes (a tree of maxDepth=1 has 1 node, a tree of maxDepth=d has 2^d - 1 nodes)     
    177       int numNodes = (int)Math.Pow(2, maxDepth) - 1;
    178       this.tree = new RegressionTreeModel.TreeNode[numNodes];
     180      this.tree = new RegressionTreeModel.TreeNode[maxSize];
     181      this.queue.Clear();
    179182      this.curTreeNodeIdx = 0;
    180183
     184      // start out with only one leaf node (constant prediction)
     185      // and calculate the best split for this root node and enqueue it into a queue sorted by improvement throught the split
    181186      // start and end idx are inclusive
    182       queue.Add(calls++, new Partition() { ParentNodeIdx = -1, Depth = maxDepth, StartIdx = 0, EndIndex = effectiveRows - 1 });
    183       CreateRegressionTreeForIdx(lineSearch);
    184 
    185       return new RegressionTreeModel(tree);
    186     }
    187 
    188     private void CreateRegressionTreeForIdx(LineSearchFunc lineSearch) {
    189       while (queue.Any()) {
    190         var f = queue.First().Value; // actually a stack
    191         queue.RemoveAt(0);
    192 
    193         var depth = f.Depth;
     187      CreateLeafNode(0, effectiveRows - 1, lineSearch);
     188
     189      // process the priority queue to complete the tree
     190      CreateRegressionTreeFromQueue(maxSize, lineSearch);
     191
     192      return new RegressionTreeModel(tree.ToArray());
     193    }
     194
     195
     196    // processes potential splits from the queue as long as splits are left and the maximum size of the tree is not reached
     197    private void CreateRegressionTreeFromQueue(int maxNodes, LineSearchFunc lineSearch) {
     198      while (queue.Any() && curTreeNodeIdx + 1 < maxNodes) { // two nodes are created in each loop
     199        var f = queue[queue.Count - 1]; // last element has the largest improvement
     200        queue.RemoveAt(queue.Count - 1);
     201
    194202        var startIdx = f.StartIdx;
    195203        var endIdx = f.EndIndex;
     
    199207        Debug.Assert(endIdx < internalIdx.Length);
    200208
    201         double threshold;
    202         string bestVariableName;
    203 
    204         // stop when only one row is left or no split is possible
    205         if (depth <= 1 || endIdx - startIdx == 0 || !FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName)) {
    206           CreateLeafNode(startIdx, endIdx, lineSearch);
    207           if (f.ParentNodeIdx >= 0) if (f.Left) {
    208               tree[f.ParentNodeIdx].leftIdx = curTreeNodeIdx;
    209             } else {
    210               tree[f.ParentNodeIdx].rightIdx = curTreeNodeIdx;
    211             }
    212           curTreeNodeIdx++;
    213         } else {
    214           int splitIdx;
    215           CreateInternalNode(f.StartIdx, f.EndIndex, bestVariableName, threshold, out splitIdx);
    216 
    217           // connect to parent tree
    218           if (f.ParentNodeIdx >= 0) if (f.Left) {
    219               tree[f.ParentNodeIdx].leftIdx = curTreeNodeIdx;
    220             } else {
    221               tree[f.ParentNodeIdx].rightIdx = curTreeNodeIdx;
    222             }
    223 
    224           Debug.Assert(splitIdx + 1 <= endIdx);
    225           Debug.Assert(startIdx <= splitIdx);
    226 
    227           queue.Add(calls++, new Partition() { ParentNodeIdx = curTreeNodeIdx, Left = true, Depth = depth - 1, StartIdx = startIdx, EndIndex = splitIdx }); // left part before right part (stack organization)
    228           queue.Add(calls++, new Partition() { ParentNodeIdx = curTreeNodeIdx, Left = false, Depth = depth - 1, StartIdx = splitIdx + 1, EndIndex = endIdx });
    229           curTreeNodeIdx++;
    230 
    231         }
    232       }
    233     }
    234 
    235 
    236     private void CreateLeafNode(int startIdx, int endIdx, LineSearchFunc lineSearch) {
    237       // max depth reached or only one element   
    238       tree[curTreeNodeIdx].varName = RegressionTreeModel.TreeNode.NO_VARIABLE;
    239       tree[curTreeNodeIdx].val = lineSearch(internalIdx, startIdx, endIdx);
    240     }
    241 
    242     // routine for building the tree for the partition of rows stored in internalIdx between startIdx and endIdx
    243     // the lineSearch function calculates the optimal prediction value for tree leaf nodes
    244     // (in the case of squared errors it is the average of target values for the rows represented by the node)
     209        // transform the leaf node into an internal node
     210        tree[f.ParentNodeIdx].VarName = f.SplittingVariable;
     211        tree[f.ParentNodeIdx].Val = f.SplittingThreshold;
     212
     213        // split partition into left and right
     214        int splitIdx;
     215        SplitPartition(f.StartIdx, f.EndIndex, f.SplittingVariable, f.SplittingThreshold, out splitIdx);
     216        Debug.Assert(splitIdx + 1 <= endIdx);
     217        Debug.Assert(startIdx <= splitIdx);
     218
     219        // create two leaf nodes (and enqueue best splits for both)
     220        tree[f.ParentNodeIdx].LeftIdx = CreateLeafNode(startIdx, splitIdx, lineSearch);
     221        tree[f.ParentNodeIdx].RightIdx = CreateLeafNode(splitIdx + 1, endIdx, lineSearch);
     222      }
     223    }
     224
     225
     226    // returns the index of the newly created tree node
     227    private int CreateLeafNode(int startIdx, int endIdx, LineSearchFunc lineSearch) {
     228      tree[curTreeNodeIdx].VarName = RegressionTreeModel.TreeNode.NO_VARIABLE;
     229      tree[curTreeNodeIdx].Val = lineSearch(internalIdx, startIdx, endIdx);
     230
     231      EnqueuePartitionSplit(curTreeNodeIdx, startIdx, endIdx);
     232      curTreeNodeIdx++;
     233      return curTreeNodeIdx - 1;
     234    }
     235
     236
     237    // calculates the optimal split for the partition [startIdx .. endIdx] (inclusive)
     238    // which is represented by the leaf node with the specified nodeIdx
     239    private void EnqueuePartitionSplit(int nodeIdx, int startIdx, int endIdx) {
     240      double threshold, improvement;
     241      string bestVariableName;
     242      // only enqueue a new split if there are at least 2 rows left and a split is possible
     243      if (startIdx < endIdx &&
     244        FindBestVariableAndThreshold(startIdx, endIdx, out threshold, out bestVariableName, out improvement)) {
     245        var split = new PartitionSplits() {
     246          ParentNodeIdx = nodeIdx,
     247          StartIdx = startIdx,
     248          EndIndex = endIdx,
     249          SplittingThreshold = threshold,
     250          SplittingVariable = bestVariableName
     251        };
     252        InsertSortedQueue(split);
     253      }
     254    }
     255
     256
     257    // routine for splitting a partition of rows stored in internalIdx between startIdx and endIdx into
     258    // a left partition and a right partition using the given splittingVariable and threshold
     259    // the splitIdx is the last index of the left partition
     260    // splitIdx + 1 is the first index of the right partition
    245261    // startIdx and endIdx are inclusive
    246     private void CreateInternalNode(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {
     262    private void SplitPartition(int startIdx, int endIdx, string splittingVar, double threshold, out int splitIdx) {
    247263      int bestVarIdx = varName2Index[splittingVar];
    248264      // split - two pass
     
    303319      Debug.Assert(startIdx <= j);
    304320
    305       tree[curTreeNodeIdx].varName = splittingVar;
    306       tree[curTreeNodeIdx].val = threshold;
    307321      splitIdx = j;
    308322    }
    309323
    310     private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar) {
     324    private bool FindBestVariableAndThreshold(int startIdx, int endIdx, out double threshold, out string bestVar, out double improvement) {
    311325      Debug.Assert(startIdx < endIdx + 1); // at least 2 elements
    312326
     
    345359      }
    346360      if (bestVar == RegressionTreeModel.TreeNode.NO_VARIABLE) {
    347         threshold = bestThreshold;
     361        // not successfull
     362        threshold = double.PositiveInfinity;
     363        improvement = double.NegativeInfinity;
    348364        return false;
    349365      } else {
    350366        UpdateVariableRelevance(bestVar, sumY, bestImprovement, rows);
     367        improvement = bestImprovement;
    351368        threshold = bestThreshold;
    352369        return true;
    353370      }
    354     }
    355 
    356     private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {
    357       if (string.IsNullOrEmpty(bestVar)) return;
    358       // update variable relevance
    359       double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect
    360 
    361       double delta = (bestImprovement - baseLine);
    362       double v;
    363       if (!sumImprovements.TryGetValue(bestVar, out v)) {
    364         sumImprovements[bestVar] = delta;
    365       }
    366       sumImprovements[bestVar] = v + delta;
    367371    }
    368372
     
    429433
    430434
     435    private void UpdateVariableRelevance(string bestVar, double sumY, double bestImprovement, int rows) {
     436      if (string.IsNullOrEmpty(bestVar)) return;
     437      // update variable relevance
     438      double baseLine = 1.0 / rows * sumY * sumY; // if best improvement is equal to baseline then the split had no effect
     439
     440      double delta = (bestImprovement - baseLine);
     441      double v;
     442      if (!sumImprovements.TryGetValue(bestVar, out v)) {
     443        sumImprovements[bestVar] = delta;
     444      }
     445      sumImprovements[bestVar] = v + delta;
     446    }
     447
    431448    public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {
    432449      // values are scaled: the most important variable has relevance = 100
     
    437454        .OrderByDescending(t => t.Value);
    438455    }
     456
     457
     458    // insert a new parition split (find insertion point and start at first element of the queue)
     459    // elements are removed from the queue at the last position
     460    // O(n), splits could be organized as a heap to improve runtime (see alglib tsort)
     461    private void InsertSortedQueue(PartitionSplits split) {
     462      // find insertion position
     463      int i = 0;
     464      while (i < queue.Count && queue[i].SplittingImprovement < split.SplittingImprovement) { i++; }
     465
     466      queue.Insert(i, split);
     467    }
    439468  }
    440469}
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeModel.cs

    r12590 r12632  
    3636    public struct TreeNode {
    3737      public readonly static string NO_VARIABLE = string.Empty;
    38       public string varName; // name of the variable for splitting or -1 if terminal node
    39       public double val; // threshold
    40       public int leftIdx;
    41       public int rightIdx;
     38      public string VarName { get; set; } // name of the variable for splitting or -1 if terminal node
     39      public double Val { get; set; } // threshold
     40      public int LeftIdx { get; set; }
     41      public int RightIdx { get; set; }
    4242
    4343      public override int GetHashCode() {
    44         return leftIdx ^ rightIdx ^ val.GetHashCode();
     44        return LeftIdx ^ RightIdx ^ Val.GetHashCode();
    4545      }
    4646    }
     
    6464    private static double GetPredictionForRow(TreeNode[] t, int nodeIdx, IDataset ds, int row) {
    6565      var node = t[nodeIdx];
    66       if (node.varName == TreeNode.NO_VARIABLE)
    67         return node.val;
    68       else if (ds.GetDoubleValue(node.varName, row) <= node.val)
    69         return GetPredictionForRow(t, node.leftIdx, ds, row);
     66      if (node.VarName == TreeNode.NO_VARIABLE)
     67        return node.Val;
     68      else if (ds.GetDoubleValue(node.VarName, row) <= node.Val)
     69        return GetPredictionForRow(t, node.LeftIdx, ds, row);
    7070      else
    71         return GetPredictionForRow(t, node.rightIdx, ds, row);
     71        return GetPredictionForRow(t, node.RightIdx, ds, row);
    7272    }
    7373
  • branches/GBT-trunkintegration/HeuristicLab.Algorithms.DataAnalysis/3.4/HeuristicLab.Algorithms.DataAnalysis-3.4.csproj

    r12620 r12632  
    199199      <Private>False</Private>
    200200    </Reference>
    201     <Reference Include="Microsoft.VisualStudio.QualityTools.UnitTestFramework, Version=10.1.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a, processorArchitecture=MSIL" />
    202201    <Reference Include="System" />
    203202    <Reference Include="System.Core">
     
    290289    <Compile Include="GradientBoostedTrees\RegressionTreeBuilder.cs" />
    291290    <Compile Include="GradientBoostedTrees\RegressionTreeModel.cs" />
    292     <Compile Include="GradientBoostedTrees\Test.cs" />
    293291    <Compile Include="Interfaces\IGaussianProcessClassificationModelCreator.cs" />
    294292    <Compile Include="Interfaces\IGaussianProcessRegressionModelCreator.cs" />
  • branches/GBT-trunkintegration/Tests/Test.cs

    r12622 r12632  
    44using System.Linq;
    55using System.Runtime.CompilerServices;
     6using System.Threading;
     7using HeuristicLab.Data;
     8using HeuristicLab.Optimization;
    69using HeuristicLab.Problems.DataAnalysis;
    710using HeuristicLab.Random;
     
    2528        var allVariables = new string[] { "y", "x1", "x2" };
    2629
    27         // x1 <= 15 -> 1
    28         // x1 >  15 -> 2
     30        // x1 <= 15 -> 2
     31        // x1 >  15 -> 1
    2932        BuildTree(xy, allVariables, 10);
    3033      }
     
    4245
    4346        // ignore irrelevant variables
    44         // x1 <= 15 -> 1
    45         // x1 >  15 -> 2
     47        // x1 <= 15 -> 2
     48        // x1 >  15 -> 1
    4649        BuildTree(xy, allVariables, 10);
    4750      }
     
    6164        // x1 <= 15 AND x2 <= 0 -> 3
    6265        // x1 <= 15 AND x2 >  0 -> 4
    63         // x1 >  15 AND x2 <= 0 -> 1
    64         // x1 >  15 AND x2 >  0 -> 2
     66        // x1 >  15 AND x2 <= 0 -> 2
     67        // x1 >  15 AND x2 >  0 -> 1
    6568        BuildTree(xy, allVariables, 10);
    6669      }
     
    8487        // x1 <= 15 AND x2 <= 0 -> 3
    8588        // x1 <= 15 AND x2 >  0 -> 4
    86         // x1 >  15 AND x2 <= 0 -> 1
    87         // x1 >  15 AND x2 >  0 -> 2
     89        // x1 >  15 AND x2 <= 0 -> 2
     90        // x1 >  15 AND x2 >  0 -> 1
    8891        BuildTree(xy, allVariables, 10);
    8992      }
     
    103106
    104107        // split cannot be found
     108        // -> 5.50
    105109        BuildTree(xy, allVariables, 3);
    106110      }
     
    116120
    117121        var allVariables = new string[] { "y", "x1", "x2" };
     122        // (two possible solutions)
     123        // x2 <= 1.5 -> 5.50
     124        // x2 >  1.5 -> 5.55
     125        BuildTree(xy, allVariables, 3);
     126
    118127        // x1 <= 1.5 AND x2 <= 1.5 -> 10
    119128        // x1 <= 1.5 AND x2 >  1.5 -> 1
    120129        // x1 >  1.5 AND x2 <= 1.5 -> 1
    121130        // x1 >  1.5 AND x2 >  1.5 -> 10.1
    122         BuildTree(xy, allVariables, 3);
     131        BuildTree(xy, allVariables, 7);
    123132      }
    124133      {
     
    136145        // x1 >  1.5 AND x2 <= 1.5 -> 0.9
    137146        // x1 >  1.5 AND x2 >  1.5 -> 1.1
    138         BuildTree(xy, allVariables, 3);
    139       }
    140 
    141     }
    142 
     147        BuildTree(xy, allVariables, 10);
     148      }
     149
     150      {
     151        // unbalanced split
     152        var xy = new double[,]
     153        {
     154          {-1, 1, 1},
     155          {-1, 1, 2},
     156          {-1, 2, 1},
     157          { 1, 2, 2},
     158        };
     159
     160        var allVariables = new string[] { "y", "x1", "x2" };
     161        // (two possible solutions)
     162        // x2 <= 1.5 -> -1.0
     163        // x2 >  1.5 AND x1 <= 1.5 -> -1.0
     164        // x2 >  1.5 AND x1 >  1.5 ->  1.0
     165        BuildTree(xy, allVariables, 10);
     166      }
     167    }
     168
     169    [TestMethod]
     170    [TestCategory("Algorithms.DataAnalysis")]
     171    [TestProperty("Time", "long")]
     172    public void GradientBoostingTestTowerSquaredError() {
     173      var gbt = new GradientBoostedTreesAlgorithm();
     174      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
     175      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
     176      var regProblem = new RegressionProblem();
     177      regProblem.Load(provider.LoadData(instance));
     178
     179      #region Algorithm Configuration
     180      gbt.Problem = regProblem;
     181      gbt.Seed = 0;
     182      gbt.SetSeedRandomly = false;
     183      gbt.Iterations = 5000;
     184      gbt.MaxSize = 20;
     185      #endregion
     186
     187      RunAlgorithm(gbt);
     188
     189      Assert.AreEqual(267.68704241153921, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
     190      Assert.AreEqual(393.84704062205469, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
     191    }
     192
     193    [TestMethod]
     194    [TestCategory("Algorithms.DataAnalysis")]
     195    [TestProperty("Time", "long")]
     196    public void GradientBoostingTestTowerAbsoluteError() {
     197      var gbt = new GradientBoostedTreesAlgorithm();
     198      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
     199      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
     200      var regProblem = new RegressionProblem();
     201      regProblem.Load(provider.LoadData(instance));
     202
     203      #region Algorithm Configuration
     204      gbt.Problem = regProblem;
     205      gbt.Seed = 0;
     206      gbt.SetSeedRandomly = false;
     207      gbt.Iterations = 1000;
     208      gbt.MaxSize = 20;
     209      gbt.Nu = 0.02;
     210      gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Absolute"));
     211      #endregion
     212
     213      RunAlgorithm(gbt);
     214
     215      Assert.AreEqual(10.551385044666661, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
     216      Assert.AreEqual(12.918001745581172, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
     217    }
     218
     219    [TestMethod]
     220    [TestCategory("Algorithms.DataAnalysis")]
     221    [TestProperty("Time", "long")]
     222    public void GradientBoostingTestTowerRelativeError() {
     223      var gbt = new GradientBoostedTreesAlgorithm();
     224      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
     225      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
     226      var regProblem = new RegressionProblem();
     227      regProblem.Load(provider.LoadData(instance));
     228
     229      #region Algorithm Configuration
     230      gbt.Problem = regProblem;
     231      gbt.Seed = 0;
     232      gbt.SetSeedRandomly = false;
     233      gbt.Iterations = 3000;
     234      gbt.MaxSize = 20;
     235      gbt.Nu = 0.005;
     236      gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Relative"));
     237      #endregion
     238
     239      RunAlgorithm(gbt);
     240
     241      Assert.AreEqual(0.061954221604374943, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
     242      Assert.AreEqual(0.06316303473499961, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
     243    }
     244
     245    // same as in SamplesUtil
     246    private void RunAlgorithm(IAlgorithm a) {
     247      var trigger = new EventWaitHandle(false, EventResetMode.ManualReset);
     248      Exception ex = null;
     249      a.Stopped += (src, e) => { trigger.Set(); };
     250      a.ExceptionOccurred += (src, e) => { ex = e.Value; trigger.Set(); };
     251      a.Prepare();
     252      a.Start();
     253      trigger.WaitOne();
     254
     255      Assert.AreEqual(ex, null);
     256    }
     257
     258    #region helper
    143259    private void BuildTree(double[,] xy, string[] allVariables, int maxDepth) {
    144260      int nRows = xy.GetLength(0);
     
    161277    private void WriteTree(RegressionTreeModel.TreeNode[] tree, int idx, string partialRule, double offset) {
    162278      var n = tree[idx];
    163       if (n.varName == RegressionTreeModel.TreeNode.NO_VARIABLE) {
    164         Console.WriteLine("{0} -> {1:F}", partialRule, n.val + offset);
     279      if (n.VarName == RegressionTreeModel.TreeNode.NO_VARIABLE) {
     280        Console.WriteLine("{0} -> {1:F}", partialRule, n.Val + offset);
    165281      } else {
    166         WriteTree(tree, n.leftIdx,
     282        WriteTree(tree, n.LeftIdx,
    167283          string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} <= {3:F}",
    168284          partialRule,
    169285          string.IsNullOrEmpty(partialRule) ? "" : " and ",
    170           n.varName,
    171           n.val), offset);
    172         WriteTree(tree, n.rightIdx,
     286          n.VarName,
     287          n.Val), offset);
     288        WriteTree(tree, n.RightIdx,
    173289          string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} >  {3:F}",
    174290          partialRule,
    175291          string.IsNullOrEmpty(partialRule) ? "" : " and ",
    176           n.varName,
    177           n.val), offset);
    178       }
    179     }
     292          n.VarName,
     293          n.Val), offset);
     294      }
     295    }
     296    #endregion
    180297  }
    181298}
Note: See TracChangeset for help on using the changeset viewer.