Changeset 13184 for stable/HeuristicLab.Algorithms.DataAnalysis
- Timestamp: 11/16/15 19:49:40
- Location: stable
- Files: 13 edited, 3 copied
stable
- Property svn:mergeinfo changed: /trunk/sources merged: 12868,12873,12875,13065-13066,13157-13158
stable/HeuristicLab.Algorithms.DataAnalysis
- Property svn:mergeinfo changed: /trunk/sources/HeuristicLab.Algorithms.DataAnalysis merged: 12868,12873,12875,13065-13066,13157-13158
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithm.cs
r12632 → r13184

       get { return (FixedValueParameter<BoolValue>)Parameters[SetSeedRandomlyParameterName]; }
     }
-    public IConstrainedValueParameter<StringValue> LossFunctionParameter {
-      get { return (IConstrainedValueParameter<StringValue>)Parameters[LossFunctionParameterName]; }
+    public IConstrainedValueParameter<ILossFunction> LossFunctionParameter {
+      get { return (IConstrainedValueParameter<ILossFunction>)Parameters[LossFunctionParameterName]; }
     }
     public IFixedValueParameter<IntValue> UpdateIntervalParameter {
…
       Parameters[CreateSolutionParameterName].Hidden = true;

-      var lossFunctionNames = ApplicationManager.Manager.GetInstances<ILossFunction>().Select(l => new StringValue(l.ToString()).AsReadOnly());
-      Parameters.Add(new ConstrainedValueParameter<StringValue>(LossFunctionParameterName, "The loss function", new ItemSet<StringValue>(lossFunctionNames)));
-      LossFunctionParameter.ActualValue = LossFunctionParameter.ValidValues.First(l => l.Value.Contains("Squared")); // squared error loss is the default
+      var lossFunctions = ApplicationManager.Manager.GetInstances<ILossFunction>();
+      Parameters.Add(new ConstrainedValueParameter<ILossFunction>(LossFunctionParameterName, "The loss function", new ItemSet<ILossFunction>(lossFunctions)));
+      LossFunctionParameter.Value = LossFunctionParameter.ValidValues.First(f => f.ToString().Contains("Squared")); // squared error loss is the default
     }
+
+    [StorableHook(HookType.AfterDeserialization)]
+    private void AfterDeserialization() {
+      // BackwardsCompatibility3.4
+      #region Backwards compatible code, remove with 3.5
+      // parameter type has been changed
+      var lossFunctionParam = Parameters[LossFunctionParameterName] as ConstrainedValueParameter<StringValue>;
+      if (lossFunctionParam != null) {
+        Parameters.Remove(LossFunctionParameterName);
+        var selectedValue = lossFunctionParam.Value; // to be restored below
+
+        var lossFunctions = ApplicationManager.Manager.GetInstances<ILossFunction>();
+        Parameters.Add(new ConstrainedValueParameter<ILossFunction>(LossFunctionParameterName, "The loss function", new ItemSet<ILossFunction>(lossFunctions)));
+        // try to restore selected value
+        var selectedLossFunction =
+          LossFunctionParameter.ValidValues.FirstOrDefault(f => f.ToString() == selectedValue.Value);
+        if (selectedLossFunction != null) {
+          LossFunctionParameter.Value = selectedLossFunction;
+        } else {
+          LossFunctionParameter.Value = LossFunctionParameter.ValidValues.First(f => f.ToString().Contains("Squared")); // default: SE
+        }
+      }
+      #endregion
+    }

     protected override void Run(CancellationToken cancellationToken) {
…
       // init
       var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
-      var lossFunction = ApplicationManager.Manager.GetInstances<ILossFunction>()
-        .Single(l => l.ToString() == LossFunctionParameter.Value.Value);
+      var lossFunction = LossFunctionParameter.Value;
       var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxSize, R, M, Nu);
…
       // produce solution
       if (CreateSolution) {
+        var model = state.GetModel();
+
         // for logistic regression we produce a classification solution
         if (lossFunction is LogisticRegressionLoss) {
-          var model = new DiscriminantFunctionClassificationModel(state.GetModel(),
+          var classificationModel = new DiscriminantFunctionClassificationModel(model,
             new AccuracyMaximizationThresholdCalculator());
           var classificationProblemData = new ClassificationProblemData(problemData.Dataset,
             problemData.AllowedInputVariables, problemData.TargetVariable, problemData.Transformations);
-          model.RecalculateModelParameters(classificationProblemData, classificationProblemData.TrainingIndices);
-
-          var classificationSolution = new DiscriminantFunctionClassificationSolution(model, classificationProblemData);
+          classificationModel.RecalculateModelParameters(classificationProblemData, classificationProblemData.TrainingIndices);
+
+          var classificationSolution = new DiscriminantFunctionClassificationSolution(classificationModel, classificationProblemData);
           Results.Add(new Result("Solution", classificationSolution));
         } else {
           // otherwise we produce a regression solution
-          Results.Add(new Result("Solution", new RegressionSolution(state.GetModel(), problemData)));
+          Results.Add(new Result("Solution", new RegressionSolution(model, problemData)));
         }
       }
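In effect, the loss function is now selected as a typed item rather than by its string name. A minimal usage sketch of the new parameter (the variable gbt and the OfType filter are illustrative, not part of this changeset):

    // select the absolute error loss on an algorithm instance;
    // ValidValues now holds ILossFunction items instead of StringValues
    var gbt = new GradientBoostedTreesAlgorithm();
    gbt.LossFunctionParameter.Value =
      gbt.LossFunctionParameter.ValidValues.OfType<AbsoluteErrorLoss>().First();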
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesAlgorithmStatic.cs
r13156 → r13184

     internal RegressionTreeBuilder treeBuilder { get; private set; }

+    private readonly uint randSeed;
     private MersenneTwister random { get; set; }
…
       this.m = m;

+      this.randSeed = randSeed;
       random = new MersenneTwister(randSeed);
       this.problemData = problemData;
…

     public IRegressionModel GetModel() {
-      return new GradientBoostedTreesModel(models, weights);
+      #pragma warning disable 618
+      var model = new GradientBoostedTreesModel(models, weights);
+      #pragma warning restore 618
+      // we don't know the number of iterations here but the number of weights is equal
+      // to the number of iterations + 1 (for the constant model)
+      // wrap the actual model in a surrogate that enables persistence and lazy recalculation of the model if necessary
+      return new GradientBoostedTreesModelSurrogate(problemData, randSeed, lossFunction, weights.Count - 1, maxSize, r, m, nu, model);
     }
     public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance() {
…

     // simple interface
-    public static IRegressionSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction, int maxSize, double nu, double r, double m, int maxIterations, uint randSeed = 31415) {
+    public static GradientBoostedTreesSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction, int maxSize, double nu, double r, double m, int maxIterations, uint randSeed = 31415) {
       Contract.Assert(r > 0);
       Contract.Assert(r <= 1.0);
…

       var model = state.GetModel();
-      return new RegressionSolution(model, (IRegressionProblemData)problemData.Clone());
+      return new GradientBoostedTreesSolution(model, (IRegressionProblemData)problemData.Clone());
     }
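A hedged sketch of how the static interface reads after this change, assuming an IRegressionProblemData instance named problemData is available (all hyperparameter values are illustrative):

    // train a small GBM; nu is the learning rate, r and m are the row and
    // variable subsampling fractions, maxSize limits the size of each tree
    var solution = GradientBoostedTreesAlgorithmStatic.TrainGbm(
      problemData, new SquaredErrorLoss(),
      maxSize: 10, nu: 0.02, r: 0.5, m: 0.5, maxIterations: 100);
    // the returned GradientBoostedTreesSolution wraps the surrogate produced
    // by GetModel(), so the ensemble can be persisted compactly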
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesModel.cs
r12660 → r13184

   [Item("Gradient boosted tree model", "")]
   // this is essentially a collection of weighted regression models
-  public sealed class GradientBoostedTreesModel : NamedItem, IRegressionModel {
-    [Storable]
+  public sealed class GradientBoostedTreesModel : NamedItem, IGradientBoostedTreesModel {
+    // BackwardsCompatibility3.4 for allowing deserialization & serialization of old models
+    #region Backwards compatible code, remove with 3.5
+    private bool isCompatibilityLoaded = false; // only set to true if the model is deserialized from the old format, needed to make sure that information is serialized again if it was loaded from the old format
+
+    [Storable(Name = "models")]
+    private IList<IRegressionModel> __persistedModels {
+      set {
+        this.isCompatibilityLoaded = true;
+        this.models.Clear();
+        foreach (var m in value) this.models.Add(m);
+      }
+      get { if (this.isCompatibilityLoaded) return models; else return null; }
+    }
+    [Storable(Name = "weights")]
+    private IList<double> __persistedWeights {
+      set {
+        this.isCompatibilityLoaded = true;
+        this.weights.Clear();
+        foreach (var w in value) this.weights.Add(w);
+      }
+      get { if (this.isCompatibilityLoaded) return weights; else return null; }
+    }
+    #endregion
+
     private readonly IList<IRegressionModel> models;
     public IEnumerable<IRegressionModel> Models { get { return models; } }

-    [Storable]
     private readonly IList<double> weights;
     public IEnumerable<double> Weights { get { return weights; } }

     [StorableConstructor]
-    private GradientBoostedTreesModel(bool deserializing) : base(deserializing) { }
+    private GradientBoostedTreesModel(bool deserializing)
+      : base(deserializing) {
+      models = new List<IRegressionModel>();
+      weights = new List<double>();
+    }
     private GradientBoostedTreesModel(GradientBoostedTreesModel original, Cloner cloner)
       : base(original, cloner) {
       this.weights = new List<double>(original.weights);
       this.models = new List<IRegressionModel>(original.models.Select(m => cloner.Clone(m)));
+      this.isCompatibilityLoaded = original.isCompatibilityLoaded;
     }
+    [Obsolete("The constructor of GBTModel should not be used directly anymore (use GBTModelSurrogate instead)")]
     public GradientBoostedTreesModel(IEnumerable<IRegressionModel> models, IEnumerable<double> weights)
       : base("Gradient boosted tree model", string.Empty) {
…
       // allocate target array go over all models and add up weighted estimation for each row
       if (!rows.Any()) return Enumerable.Empty<double>(); // return immediately if rows is empty. This prevents multiple iteration over lazy rows enumerable.
       // (which essentially looks up indexes in a dictionary)
       var res = new double[rows.Count()];
       for (int i = 0; i < models.Count; i++) {
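The model itself is unchanged by this revision: it is still a weighted additive ensemble. A self-contained sketch of that prediction rule, with plain delegates standing in for IRegressionModel (illustrative only, not the shipped class):

    using System;
    using System.Collections.Generic;

    static class WeightedEnsembleDemo {
      // prediction of a weighted additive ensemble: sum_i w_i * f_i(x)
      static double Predict(IList<Func<double[], double>> models, IList<double> weights, double[] x) {
        double sum = 0.0;
        for (int i = 0; i < models.Count; i++) sum += weights[i] * models[i](x);
        return sum;
      }

      static void Main() {
        // a constant model (the average) plus one "tree" stub, both weighted 1.0,
        // mirroring how GBT combines a constant model with boosted trees
        var models = new List<Func<double[], double>> { _ => 4.2, x => x[0] < 1.0 ? -1.0 : 1.0 };
        var weights = new List<double> { 1.0, 1.0 };
        Console.WriteLine(Predict(models, weights, new[] { 0.5 })); // 3.2
      }
    }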
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/GradientBoostedTreesModelSurrogate.cs
r12868 → r13184

 #endregion

-using System;
 using System.Collections.Generic;
-using System.Linq;
 using HeuristicLab.Common;
 using HeuristicLab.Core;
 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
-using HeuristicLab.PluginInfrastructure;
 using HeuristicLab.Problems.DataAnalysis;
…
   // recalculate the actual GBT model on demand
   [Item("Gradient boosted tree model", "")]
-  public sealed class GradientBoostedTreesModelSurrogate : NamedItem, IRegressionModel {
+  public sealed class GradientBoostedTreesModelSurrogate : NamedItem, IGradientBoostedTreesModel {
     // don't store the actual model!
-    private IRegressionModel actualModel; // the actual model is only recalculated when necessary
+    private IGradientBoostedTreesModel actualModel; // the actual model is only recalculated when necessary

     [Storable]
…
     private readonly uint seed;
     [Storable]
-    private string lossFunctionName;
+    private ILossFunction lossFunction;
     [Storable]
     private double r;
…

       this.trainingProblemData = cloner.Clone(original.trainingProblemData);
+      this.lossFunction = cloner.Clone(original.lossFunction);
       this.seed = original.seed;
-      this.lossFunctionName = original.lossFunctionName;
       this.iterations = original.iterations;
       this.maxSize = original.maxSize;
…

     // create only the surrogate model without an actual model
-    public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, string lossFunctionName, int iterations, int maxSize, double r, double m, double nu)
+    public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu)
       : base("Gradient boosted tree model", string.Empty) {
       this.trainingProblemData = trainingProblemData;
       this.seed = seed;
-      this.lossFunctionName = lossFunctionName;
+      this.lossFunction = lossFunction;
       this.iterations = iterations;
       this.maxSize = maxSize;
…

     // wrap an actual model in a surrograte
-    public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, string lossFunctionName, int iterations, int maxSize, double r, double m, double nu, IRegressionModel model)
-      : this(trainingProblemData, seed, lossFunctionName, iterations, maxSize, r, m, nu) {
+    public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu, IGradientBoostedTreesModel model)
+      : this(trainingProblemData, seed, lossFunction, iterations, maxSize, r, m, nu) {
       this.actualModel = model;
     }
…

-    private IRegressionModel RecalculateModel() {
-      var lossFunction = ApplicationManager.Manager.GetInstances<ILossFunction>().Single(l => l.ToString() == lossFunctionName);
+    private IGradientBoostedTreesModel RecalculateModel() {
       return GradientBoostedTreesAlgorithmStatic.TrainGbm(trainingProblemData, lossFunction, maxSize, nu, r, m, iterations, seed).Model;
+    }
+
+    public IEnumerable<IRegressionModel> Models {
+      get {
+        if (actualModel == null) actualModel = RecalculateModel();
+        return actualModel.Models;
+      }
+    }
+
+    public IEnumerable<double> Weights {
+      get {
+        if (actualModel == null) actualModel = RecalculateModel();
+        return actualModel.Weights;
+      }
     }
   }
 }
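The surrogate idea in isolation, as a minimal self-contained sketch (the class and member names here are hypothetical; the real surrogate persists the training data, seed, loss function, and hyperparameters via [Storable] and rebuilds the ensemble deterministically from them):

    using System;

    // stores only the inputs needed to rebuild an expensive result and
    // recomputes it on first access, the same pattern RecalculateModel uses
    sealed class Surrogate<T> where T : class {
      private readonly Func<T> recalculate; // e.g. deterministic retraining from a stored seed
      private T actual;                     // not persisted; rebuilt on demand
      public Surrogate(Func<T> recalculate, T actual = null) {
        this.recalculate = recalculate;
        this.actual = actual; // may be pre-populated when wrapping a freshly trained model
      }
      public T Value {
        get {
          if (actual == null) actual = recalculate();
          return actual;
        }
      }
    }

This trades storage size for recomputation time: serialized files stay small because only the training inputs are written, at the cost of retraining when a deserialized model is first queried.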
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/LossFunctions/AbsoluteErrorLoss.cs
r12700 → r13184

 using System;
 using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
 using HeuristicLab.Common;
+using HeuristicLab.Core;
+using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;

 namespace HeuristicLab.Algorithms.DataAnalysis {
   // loss function for the weighted absolute error
-  public class AbsoluteErrorLoss : ILossFunction {
+  [StorableClass]
+  [Item("Absolute error loss", "")]
+  public sealed class AbsoluteErrorLoss : Item, ILossFunction {
+    public AbsoluteErrorLoss() { }
+
     public double GetLoss(IEnumerable<double> target, IEnumerable<double> pred) {
       var targetEnum = target.GetEnumerator();
…
     }

-    public override string ToString() {
-      return "Absolute error loss";
+    #region item implementation
+    [StorableConstructor]
+    private AbsoluteErrorLoss(bool deserializing) : base(deserializing) { }
+
+    private AbsoluteErrorLoss(AbsoluteErrorLoss original, Cloner cloner) : base(original, cloner) { }
+
+    public override IDeepCloneable Clone(Cloner cloner) {
+      return new AbsoluteErrorLoss(this, cloner);
     }
+    #endregion
   }
 }
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/LossFunctions/ILossFunction.cs
r12700 → r13184

 using System.Collections.Generic;
+using HeuristicLab.Core;

 namespace HeuristicLab.Algorithms.DataAnalysis {
…
   // target represents the target vector (original targets from the problem data, never changed)
   // pred represents the current vector of predictions (a weighted combination of models learned so far, this vector is updated after each step)
-  public interface ILossFunction {
+  public interface ILossFunction : IItem {
     // returns the loss of the current prediction vector
     double GetLoss(IEnumerable<double> target, IEnumerable<double> pred);
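All loss implementations in this changeset walk the target and prediction sequences in lockstep with two enumerators. A self-contained squared-error version of that pattern (illustrative; the shipped classes additionally validate that both sequences have equal length):

    using System;
    using System.Collections.Generic;

    static class LossDemo {
      // squared error computed with paired enumerators, as the GetLoss
      // implementations do, so each sequence is enumerated exactly once
      static double SquaredErrorLoss(IEnumerable<double> target, IEnumerable<double> pred) {
        var t = target.GetEnumerator();
        var p = pred.GetEnumerator();
        double sum = 0.0;
        while (t.MoveNext() & p.MoveNext()) { // non-short-circuit '&' advances both
          double res = t.Current - p.Current;
          sum += res * res;
        }
        return sum;
      }

      static void Main() {
        Console.WriteLine(SquaredErrorLoss(new[] { 1.0, 2.0 }, new[] { 0.5, 2.5 })); // 0.5
      }
    }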
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/LossFunctions/LogisticRegressionLoss.cs
r12700 → r13184

 using System.Linq;
 using HeuristicLab.Common;
+using HeuristicLab.Core;
+using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;

 namespace HeuristicLab.Algorithms.DataAnalysis {
   // Greedy Function Approximation: A Gradient Boosting Machine (page 9)
-  public class LogisticRegressionLoss : ILossFunction {
+  [StorableClass]
+  [Item("Logistic regression loss", "")]
+  public sealed class LogisticRegressionLoss : Item, ILossFunction {
+    public LogisticRegressionLoss() { }
+
     public double GetLoss(IEnumerable<double> target, IEnumerable<double> pred) {
       var targetEnum = target.GetEnumerator();
…
     }

-    public override string ToString() {
-      return "Logistic regression loss";
+    #region item implementation
+    [StorableConstructor]
+    private LogisticRegressionLoss(bool deserializing) : base(deserializing) { }
+
+    private LogisticRegressionLoss(LogisticRegressionLoss original, Cloner cloner) : base(original, cloner) { }
+
+    public override IDeepCloneable Clone(Cloner cloner) {
+      return new LogisticRegressionLoss(this, cloner);
     }
+    #endregion
+
   }
 }
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/LossFunctions/RelativeErrorLoss.cs
r12700 → r13184

 using System.Linq;
 using HeuristicLab.Common;
+using HeuristicLab.Core;
+using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;

 namespace HeuristicLab.Algorithms.DataAnalysis {
   // relative error loss is a special case of weighted absolute error loss with weights = (1/target)
-  public class RelativeErrorLoss : ILossFunction {
+  [StorableClass]
+  [Item("Relative error loss", "")]
+  public sealed class RelativeErrorLoss : Item, ILossFunction {
+    public RelativeErrorLoss() { }
+
     public double GetLoss(IEnumerable<double> target, IEnumerable<double> pred) {
       var targetEnum = target.GetEnumerator();
…
     }

-    public override string ToString() {
-      return "Relative error loss";
+    #region item implementation
+    [StorableConstructor]
+    private RelativeErrorLoss(bool deserializing) : base(deserializing) { }
+
+    private RelativeErrorLoss(RelativeErrorLoss original, Cloner cloner) : base(original, cloner) { }
+
+    public override IDeepCloneable Clone(Cloner cloner) {
+      return new RelativeErrorLoss(this, cloner);
     }
+    #endregion
   }
 }
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/LossFunctions/SquaredErrorLoss.cs
r12700 → r13184

 using System.Collections.Generic;
 using System.Linq;
+using HeuristicLab.Common;
+using HeuristicLab.Core;
+using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;

 namespace HeuristicLab.Algorithms.DataAnalysis {
-  public class SquaredErrorLoss : ILossFunction {
+  [StorableClass]
+  [Item("Squared error loss", "")]
+  public sealed class SquaredErrorLoss : Item, ILossFunction {
+    public SquaredErrorLoss() { }
+
     public double GetLoss(IEnumerable<double> target, IEnumerable<double> pred) {
       var targetEnum = target.GetEnumerator();
…
     }

-    public override string ToString() {
-      return "Squared error loss";
+    #region item implementation
+    [StorableConstructor]
+    private SquaredErrorLoss(bool deserializing) : base(deserializing) { }
+
+    private SquaredErrorLoss(SquaredErrorLoss original, Cloner cloner) : base(original, cloner) { }
+
+    public override IDeepCloneable Clone(Cloner cloner) {
+      return new SquaredErrorLoss(this, cloner);
     }
+    #endregion
   }
 }
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeBuilder.cs
r12700 → r13184

     }

-    // simple API produces a single regression tree optimizing sum of squared errors
-    // this can be used if only a simple regression tree should be produced
-    // for a set of trees use the method CreateRegressionTreeForGradientBoosting below
-    //
-    // r and m work in the same way as for alglib random forest
-    // r is fraction of rows to use for training
-    // m is fraction of variables to use for training
-    public IRegressionModel CreateRegressionTree(int maxSize, double r = 0.5, double m = 0.5) {
-      // subtract mean of y first
-      var yAvg = y.Average();
-      for (int i = 0; i < y.Length; i++) y[i] -= yAvg;
-
-      var seLoss = new SquaredErrorLoss();
-
-      var model = CreateRegressionTreeForGradientBoosting(y, curPred, maxSize, problemData.TrainingIndices.ToArray(), seLoss, r, m);
-
-      return new GradientBoostedTreesModel(new[] { new ConstantRegressionModel(yAvg), model }, new[] { 1.0, 1.0 });
-    }
-
     // specific interface that allows to specify the target labels and the training rows which is necessary when for gradient boosted trees
     public IRegressionModel CreateRegressionTreeForGradientBoosting(double[] y, double[] curPred, int maxSize, int[] idx, ILossFunction lossFunction, double r = 0.5, double m = 0.5) {
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GradientBoostedTrees/RegressionTreeModel.cs
r12700 → r13184

     [Storable]
     // to prevent storing the references to data caches in nodes
+    // seemingly it is bad (performance-wise) to persist tuples (tuples are used as keys in a dictionary) TODO
     private Tuple<string, double, int, int>[] SerializedTree {
       get { return tree.Select(t => Tuple.Create(t.VarName, t.Val, t.LeftIdx, t.RightIdx)).ToArray(); }
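A self-contained sketch of the flattened-tree representation that SerializedTree round-trips (field names match the tuple components; marking leaves with negative child indices is an assumption made here for illustration):

    using System;

    // a decision-tree node flattened into a plain record so the whole tree
    // can be persisted as an array of tuples instead of linked node objects
    struct TreeNode {
      public string VarName; // splitting variable ("" for leaves)
      public double Val;     // split threshold, or the predicted value at a leaf
      public int LeftIdx;    // array index of the left child (-1 at a leaf)
      public int RightIdx;   // array index of the right child (-1 at a leaf)
    }

    static class FlatTreeDemo {
      // evaluate by walking array indices instead of object references
      static double Predict(TreeNode[] tree, Func<string, double> x) {
        int i = 0;
        while (tree[i].LeftIdx >= 0)
          i = x(tree[i].VarName) <= tree[i].Val ? tree[i].LeftIdx : tree[i].RightIdx;
        return tree[i].Val;
      }

      static void Main() {
        var tree = new[] {
          new TreeNode { VarName = "x1", Val = 0.5, LeftIdx = 1, RightIdx = 2 },
          new TreeNode { VarName = "", Val = -1.0, LeftIdx = -1, RightIdx = -1 },
          new TreeNode { VarName = "", Val = 1.0, LeftIdx = -1, RightIdx = -1 },
        };
        Console.WriteLine(Predict(tree, v => 0.3)); // -1
      }
    }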
stable/HeuristicLab.Algorithms.DataAnalysis/3.4/HeuristicLab.Algorithms.DataAnalysis-3.4.csproj
r13156 → r13184

     <Compile Include="GaussianProcess\GaussianProcessRegressionSolution.cs" />
     <Compile Include="GaussianProcess\ICovarianceFunction.cs" />
+    <Compile Include="GradientBoostedTrees\IGradientBoostedTreesModel.cs" />
+    <Compile Include="GradientBoostedTrees\GradientBoostedTreesModelSurrogate.cs" />
     <Compile Include="GradientBoostedTrees\GradientBoostedTreesAlgorithm.cs" />
     <Compile Include="GradientBoostedTrees\GradientBoostedTreesAlgorithmStatic.cs" />
…
     <Compile Include="GradientBoostedTrees\LossFunctions\RelativeErrorLoss.cs" />
     <Compile Include="GradientBoostedTrees\LossFunctions\SquaredErrorLoss.cs" />
+    <Compile Include="GradientBoostedTrees\GradientBoostedTreesSolution.cs" />
     <Compile Include="GradientBoostedTrees\RegressionTreeBuilder.cs" />
     <Compile Include="GradientBoostedTrees\RegressionTreeModel.cs" />