using System;
using System.Collections;
using System.Globalization;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using HeuristicLab.Data;
using HeuristicLab.Optimization;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Random;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace HeuristicLab.Algorithms.DataAnalysis.GradientBoostedTrees {
  [TestClass()]
  public class Test {
    [TestMethod]
    [TestCategory("Algorithms.DataAnalysis")]
    [TestProperty("Time", "short")]
    public void DecisionTreeTest() {
      {
        var xy = new double[,] {
          {1, 20, 0},
          {1, 20, 0},
          {2, 10, 0},
          {2, 10, 0},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // x1 <= 15 -> 2
        // x1 > 15 -> 1
        BuildTree(xy, allVariables, 10);
      }
      {
        var xy = new double[,] {
          {1, 20, 1},
          {1, 20, -1},
          {2, 10, -1},
          {2, 10, 1},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // ignore irrelevant variables
        // x1 <= 15 -> 2
        // x1 > 15 -> 1
        BuildTree(xy, allVariables, 10);
      }
      {
        // split must be by x1 first
        var xy = new double[,] {
          {1, 20, 1},
          {2, 20, -1},
          {3, 10, -1},
          {4, 10, 1},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // x1 <= 15 AND x2 <= 0 -> 3
        // x1 <= 15 AND x2 > 0 -> 4
        // x1 > 15 AND x2 <= 0 -> 2
        // x1 > 15 AND x2 > 0 -> 1
        BuildTree(xy, allVariables, 10);
      }
      {
        // averaging ys
        var xy = new double[,] {
          {0.5, 20, 1},
          {1.5, 20, 1},
          {1.5, 20, -1},
          {2.5, 20, -1},
          {2.5, 10, -1},
          {3.5, 10, -1},
          {3.5, 10, 1},
          {4.5, 10, 1},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // x1 <= 15 AND x2 <= 0 -> 3
        // x1 <= 15 AND x2 > 0 -> 4
        // x1 > 15 AND x2 <= 0 -> 2
        // x1 > 15 AND x2 > 0 -> 1
        BuildTree(xy, allVariables, 10);
      }
      {
        // diagonal split (no split possible)
        var xy = new double[,] {
          {10, 1, 1},
          {1, 1, 2},
          {1, 2, 1},
          {10, 2, 2},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // split cannot be found
        // -> 5.50
        BuildTree(xy, allVariables, 3);
      }
      {
        // almost diagonal split
        var xy = new double[,] {
          {10, 1, 1},
          {1, 1, 2},
          {1, 2, 1},
          {10.1, 2, 2},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // (two possible solutions)
        // x2 <= 1.5 -> 5.50
        // x2 > 1.5 -> 5.55
        BuildTree(xy, allVariables, 3);
        // x1 <= 1.5 AND x2 <= 1.5 -> 10
        // x1 <= 1.5 AND x2 > 1.5 -> 1
        // x1 > 1.5 AND x2 <= 1.5 -> 1
        // x1 > 1.5 AND x2 > 1.5 -> 10.1
        BuildTree(xy, allVariables, 7);
      }
      {
        // unbalanced split
        var xy = new double[,] {
          {-1, 1, 1},
          {-1, 1, 2},
          {0.9, 2, 1},
          {1.1, 2, 2},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // x1 <= 1.5 -> -1.0
        // x1 > 1.5 AND x2 <= 1.5 -> 0.9
        // x1 > 1.5 AND x2 > 1.5 -> 1.1
        BuildTree(xy, allVariables, 10);
      }
      {
        // unbalanced split
        var xy = new double[,] {
          {-1, 1, 1},
          {-1, 1, 2},
          {-1, 2, 1},
          { 1, 2, 2},
        };
        var allVariables = new string[] { "y", "x1", "x2" };
        // (two possible solutions)
        // x2 <= 1.5 -> -1.0
        // x2 > 1.5 AND x1 <= 1.5 -> -1.0
        // x2 > 1.5 AND x1 > 1.5 -> 1.0
        BuildTree(xy, allVariables, 10);
      }
    }

    [TestMethod]
    [TestCategory("Algorithms.DataAnalysis")]
    [TestProperty("Time", "long")]
    public void GradientBoostingTestTowerSquaredError() {
      var gbt = new GradientBoostedTreesAlgorithm();
      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
      var regProblem = new RegressionProblem();
      regProblem.Load(provider.LoadData(instance));

      #region Algorithm Configuration
      gbt.Problem = regProblem;
      gbt.Seed = 0;
      gbt.SetSeedRandomly = false;
      gbt.Iterations = 5000;
      gbt.MaxSize = 20;
      #endregion

      RunAlgorithm(gbt);
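
      // note: the expected losses below are fixed-seed baseline values (presumably recorded from a
      // reference run with Seed = 0); the assertions check reproducibility, not model quality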
      Assert.AreEqual(267.68704241153921, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
      Assert.AreEqual(393.84704062205469, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
    }

    [TestMethod]
    [TestCategory("Algorithms.DataAnalysis")]
    [TestProperty("Time", "long")]
    public void GradientBoostingTestTowerAbsoluteError() {
      var gbt = new GradientBoostedTreesAlgorithm();
      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
      var regProblem = new RegressionProblem();
      regProblem.Load(provider.LoadData(instance));

      #region Algorithm Configuration
      gbt.Problem = regProblem;
      gbt.Seed = 0;
      gbt.SetSeedRandomly = false;
      gbt.Iterations = 1000;
      gbt.MaxSize = 20;
      gbt.Nu = 0.02;
      gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Absolute"));
      #endregion

      RunAlgorithm(gbt);

      Assert.AreEqual(10.551385044666661, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
      Assert.AreEqual(12.918001745581172, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
    }

    [TestMethod]
    [TestCategory("Algorithms.DataAnalysis")]
    [TestProperty("Time", "long")]
    public void GradientBoostingTestTowerRelativeError() {
      var gbt = new GradientBoostedTreesAlgorithm();
      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
      var instance = provider.GetDataDescriptors().Single(x => x.Name.Contains("Tower"));
      var regProblem = new RegressionProblem();
      regProblem.Load(provider.LoadData(instance));

      #region Algorithm Configuration
      gbt.Problem = regProblem;
      gbt.Seed = 0;
      gbt.SetSeedRandomly = false;
      gbt.Iterations = 3000;
      gbt.MaxSize = 20;
      gbt.Nu = 0.005;
      gbt.LossFunctionParameter.Value = gbt.LossFunctionParameter.ValidValues.First(l => l.ToString().Contains("Relative"));
      #endregion

      RunAlgorithm(gbt);

      Assert.AreEqual(0.061954221604374943, ((DoubleValue)gbt.Results["Loss (train)"].Value).Value, 1E-6);
      Assert.AreEqual(0.06316303473499961, ((DoubleValue)gbt.Results["Loss (test)"].Value).Value, 1E-6);
    }

    // same as in SamplesUtil
    private void RunAlgorithm(IAlgorithm a) {
      var trigger = new EventWaitHandle(false, EventResetMode.ManualReset);
      Exception ex = null;
      a.Stopped += (src, e) => { trigger.Set(); };
      a.ExceptionOccurred += (src, e) => { ex = e.Value; trigger.Set(); };
      a.Prepare();
      a.Start();
      trigger.WaitOne();

      Assert.AreEqual(ex, null);
    }

    #region helper
    // builds a single regression tree on the given data (training partition = all rows) and
    // prints its decision rules to the console for manual comparison with the expected splits
    private void BuildTree(double[,] xy, string[] allVariables, int maxDepth) {
      int nRows = xy.GetLength(0);
      var allowedInputs = allVariables.Skip(1);
      var dataset = new Dataset(allVariables, xy);
      var problemData = new RegressionProblemData(dataset, allowedInputs, allVariables.First());
      problemData.TrainingPartition.Start = 0;
      problemData.TrainingPartition.End = nRows;
      problemData.TestPartition.Start = nRows;
      problemData.TestPartition.End = nRows;
      var rand = new MersenneTwister(31415);
      var builder = new RegressionTreeBuilder(problemData, rand);
      var model = (GradientBoostedTreesModel)builder.CreateRegressionTree(maxDepth, 1, 1); // maximal depth and use all rows and cols
      var constM = model.Models.First() as ConstantRegressionModel;
      var treeM = model.Models.Skip(1).First() as RegressionTreeModel;
      WriteTree(treeM.tree, 0, "", constM.Constant);
      Console.WriteLine();
    }

    // recursively prints one rule per leaf; leaf predictions include the constant model offset
    private void WriteTree(RegressionTreeModel.TreeNode[] tree, int idx, string partialRule, double offset) {
      var n = tree[idx];
      if (n.VarName == RegressionTreeModel.TreeNode.NO_VARIABLE) {
        Console.WriteLine("{0} -> {1:F}", partialRule, n.Val + offset);
      } else {
        WriteTree(tree, n.LeftIdx, string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} <= {3:F}", partialRule,
          string.IsNullOrEmpty(partialRule) ? "" : " and ", n.VarName, n.Val), offset);
        WriteTree(tree, n.RightIdx, string.Format(CultureInfo.InvariantCulture, "{0}{1}{2} > {3:F}", partialRule,
          string.IsNullOrEmpty(partialRule) ? "" : " and ", n.VarName, n.Val), offset);
      }
    }
    #endregion
  }
}