Changeset 15833


Ignore:
Timestamp:
03/08/18 10:57:03 (15 months ago)
Author:
bwerth
Message:

#2847: added handling of empty and underdetermined data sets

Location:
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/GBM/GradientBoostingRegressionAlgorithm.cs

    r14523 r15833  
    4545  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 350)]
    4646  public class GradientBoostingRegressionAlgorithm : FixedDataAnalysisAlgorithm<IRegressionProblem> {
    47 
    4847    #region ParameterNames
    49 
    5048    private const string IterationsParameterName = "Iterations";
    5149    private const string NuParameterName = "Nu";
     
    5856    private const string StoreRunsParameterName = "StoreRuns";
    5957    private const string RegressionAlgorithmSolutionResultParameterName = "RegressionAlgorithmResult";
    60 
    6158    #endregion
    6259
    6360    #region ParameterProperties
    64 
    6561    public IFixedValueParameter<IntValue> IterationsParameter {
    6662      get { return (IFixedValueParameter<IntValue>)Parameters[IterationsParameterName]; }
     
    10298      get { return (IFixedValueParameter<BoolValue>)Parameters[StoreRunsParameterName]; }
    10399    }
    104 
    105100    #endregion
    106101
    107102    #region Properties
    108 
    109103    public int Iterations {
    110104      get { return IterationsParameter.Value.Value; }
     
    155149      set { RegressionAlgorithmSolutionResultParameter.Value.Value = value; }
    156150    }
    157 
    158151    #endregion
    159152
    160153    [StorableConstructor]
    161154    protected GradientBoostingRegressionAlgorithm(bool deserializing)
    162       : base(deserializing) {
    163     }
     155      : base(deserializing) { }
    164156
    165157    protected GradientBoostingRegressionAlgorithm(GradientBoostingRegressionAlgorithm original, Cloner cloner)
    166       : base(original, cloner) {
    167     }
     158      : base(original, cloner) { }
    168159
    169160    public override IDeepCloneable Clone(Cloner cloner) {
     
    232223      var problemData = Problem.ProblemData;
    233224      var targetVarName = problemData.TargetVariable;
    234       var activeVariables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
     225      var activeVariables = problemData.AllowedInputVariables.Concat(new string[] {problemData.TargetVariable});
    235226      var modifiableDataset = new ModifiableDataset(
    236227        activeVariables,
     
    252243      List<IRegressionModel> models = new List<IRegressionModel>();
    253244      try {
    254 
    255245        // Loop until iteration limit reached or canceled.
    256246        for (int i = 0; i < Iterations; i++) {
     
    258248
    259249          modifiableDataset.RemoveVariable(targetVarName);
    260           modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest));
     250          modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest).ToList());
    261251
    262252          SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices); // all training indices from the original problem data are allowed
     
    301291
    302292            models.Add(model);
    303 
    304 
    305293          }
    306294
     
    363351      alglib.lrunpack(lm, out coefficients, out features);
    364352
    365       var ensembleModel = new RegressionEnsembleModel(models, coefficients.Take(models.Count)) { AverageModelEstimates = false };
     353      var ensembleModel = new RegressionEnsembleModel(models, coefficients.Take(models.Count)) {AverageModelEstimates = false};
    366354      var ensembleSolution = (IRegressionEnsembleSolution)ensembleModel.CreateRegressionSolution(problemData);
    367355      return ensembleSolution;
     
    442430        prob.ProblemDataParameter.Value = problemData;
    443431        return true;
    444       } else return false;
     432      }
     433      else return false;
    445434    }
    446435
     
    478467          // NaN evaluations would not be critical but are problematic if we want to combine all symbolic models into a single symbolic model
    479468          if (symbRegSol == null ||
    480             (symbRegSol.TrainingLowerEstimationLimitHits == 0 && symbRegSol.TrainingUpperEstimationLimitHits == 0 &&
    481              symbRegSol.TestLowerEstimationLimitHits == 0 && symbRegSol.TestUpperEstimationLimitHits == 0) &&
    482             symbRegSol.TrainingNaNEvaluations == 0 && symbRegSol.TestNaNEvaluations == 0) {
     469              (symbRegSol.TrainingLowerEstimationLimitHits == 0 && symbRegSol.TrainingUpperEstimationLimitHits == 0 &&
     470               symbRegSol.TestLowerEstimationLimitHits == 0 && symbRegSol.TestUpperEstimationLimitHits == 0) &&
     471              symbRegSol.TrainingNaNEvaluations == 0 && symbRegSol.TestNaNEvaluations == 0) {
    483472            model = sol.Model;
    484473          }
     
    499488        ((BoolValue)paramItem.Parameters["SetSeedRandomly"].ActualValue).Value = false;
    500489        ((IntValue)paramItem.Parameters["Seed"].ActualValue).Value = seed;
    501       } else {
     490      }
     491      else {
    502492        throw new ArgumentException("Base learner does not have a seed parameter (algorithm {0})", alg.Name);
    503493      }
    504 
    505494    }
    506495  }
  • branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafModels/PreconstructedLinearModel.cs

    r15830 r15833  
    5959      SampleSize = original.SampleSize;
    6060    }
    61     public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
     61    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable, double residualVariance = 0, double sampleSize = 0) : base(targetvariable) {
    6262      Coefficients = coefficients;
    6363      Intercept = intercept;
     
    178178      if (SampleSize == 0) return 0.0;
    179179      var sum = (from var in Variances let d = dataset.GetDoubleValue(var.Key, row) - Means[var.Key] select d * d / var.Value).Sum();
    180       var res = ResidualVariance * (1.0 / SampleSize + sum / (SampleSize - 1));
     180      var res = ResidualVariance * (SampleSize - 1) / (SampleSize - 2) * (1.0 / SampleSize + sum / (SampleSize - 1));
    181181      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
    182       return res;
     182      return Math.Sqrt(res);
    183183    }
    184184    #endregion
  • branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/M5Regression.cs

    r15830 r15833  
    148148    #region Static Interface
    149149    public static IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, IRandom random, ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null,
    150       bool useHoldout = false, double holdoutSize = 0.2, int minimumLeafSize = 4, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) {
     150      bool useHoldout = false, double holdoutSize = 0.2, int minimumLeafSize = 1, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) {
    151151      if (leafModel == null) leafModel = new LinearLeaf();
    152152      if (splitter == null) splitter = new M5Splitter();
     
    170170
    171171    #region Helpers
    172     private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool GenerateRules, bool useHoldout, double holdoutSize) {
     172    private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize) {
    173173      var stateScope = new Scope("RegressionTreeStateScope");
    174174
     
    196196      //store unbuilt model
    197197      IItem model;
    198       if (GenerateRules) {
     198      if (generateRules) {
    199199        model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
    200200        RegressionRuleSetModel.Initialize(stateScope);
     
    215215
    216216    private static IRegressionModel Build(IScope stateScope, ResultCollection results, CancellationToken cancellationToken) {
     217      var regressionTreeParams = (RegressionTreeParameters)stateScope.Variables[RegressionTreeParameterVariableName].Value;
    217218      var model = (IM5Model)stateScope.Variables[ModelVariableName].Value;
    218219      var trainingRows = (IntArray)stateScope.Variables[TrainingSetVariableName].Value;
    219220      var pruningRows = (IntArray)stateScope.Variables[PruningSetVariableName].Value;
     221      if (1 > trainingRows.Length)
     222        return new PreconstructedLinearModel(new Dictionary<string, double>(), new Dictionary<string, double>(), new Dictionary<string, double>(), 0, regressionTreeParams.TargetVariable);
     223      if (regressionTreeParams.MinLeafSize > trainingRows.Length) {
     224        var targets = regressionTreeParams.Data.GetDoubleValues(regressionTreeParams.TargetVariable).ToArray();
     225        return new PreconstructedLinearModel(new Dictionary<string, double>(), new Dictionary<string, double>(), new Dictionary<string, double>(), targets.Average(), regressionTreeParams.TargetVariable, targets.Variance(), targets.Length);
     226      }
    220227      model.Build(trainingRows.ToArray(), pruningRows.ToArray(), stateScope, results, cancellationToken);
    221228      return model;
Note: See TracChangeset for help on using the changeset viewer.