Ignore:
Timestamp:
04/19/19 13:06:11 (4 months ago)
Author:
gkronber
Message:

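#2847: made some minor changes while reviewing (renamed variables and parameters for clarity, removed commented-out code in DoRegression, and updated the item description)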

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/2847_M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/M5Leaf.cs

    r15967 r16847  
    2626using HeuristicLab.Common;
    2727using HeuristicLab.Core;
    28 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
    2928using HeuristicLab.Problems.DataAnalysis;
     29using HEAL.Attic;
    3030
    3131namespace HeuristicLab.Algorithms.DataAnalysis {
    32   [StorableClass]
    33   [Item("M5Leaf", "A leaf type that uses linear models as leaf models. This is the standard for M5' regression")]
     32  [StorableType("58517042-5318-4087-B098-AC75F0208BA0")]
     33  [Item("M5Leaf", "A leaf type that uses regularized linear models with feature selection as leaf models.")]
    3434  public class M5Leaf : LeafBase {
    3535    #region Constructors & Cloning
    3636    [StorableConstructor]
    37     private M5Leaf(bool deserializing) : base(deserializing) { }
     37    private M5Leaf(StorableConstructorFlag _) : base(_) { }
    3838    private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { }
    3939    public M5Leaf() { }
     
    4747      get { return false; }
    4848    }
    49     public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
     49    public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
    5050      if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model");
    5151
    5252      if (pd.Dataset.Rows == 1)
    53         return new ConstantLeaf().Build(pd, random, cancellationToken, out noParameters);
     53        return new ConstantLeaf().Build(pd, random, cancellationToken, out numberOfParameters);
    5454
    5555      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average());
     
    5757      var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList();
    5858
    59       var classMean = pd.TargetVariableTrainingValues.Average();
    60       var classVar = pd.TargetVariableTrainingValues.Variance();
     59      var targetMean = pd.TargetVariableTrainingValues.Average();
     60      var targetVariance = pd.TargetVariableTrainingValues.Variance();
    6161
    62       var model = FindBestModel(variances, means, classMean, classVar, pd, used);
    63       noParameters = 1 + model.Coefficients.Count;
     62      var model = FindBestModel(variances, means, targetMean, targetVariance, pd, used);
     63      numberOfParameters = 1 + model.Coefficients.Count;
    6464      return model;
    6565    }
     
    6969    #endregion
    7070
    71     private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double cMean, double cVar, IRegressionProblemData pd, IList<string> variables) {
     71    private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double yVariance, IRegressionProblemData pd, IList<string> variables) {
    7272      Dictionary<string, double> coeffs;
    7373      double intercept;
    7474      do {
    75         coeffs = DoRegression(pd, variables, variances, means, cMean, 1.0e-8, out intercept);
    76         variables = DeselectColinear(variances, coeffs, cVar, pd, variables);
     75        coeffs = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept);
     76        variables = DeselectColinear(variances, coeffs, yVariance, pd, variables);
    7777      }
    7878      while (coeffs.Count != variables.Count);
     
    8888        improved = false;
    8989        currentNumAttributes--;
    90         // Find attribute with smallest SC
    91         var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / cVar)))
     90        // Find attribute with smallest SC (variance-scaled coefficient)
     91        var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance)))
    9292          .OrderBy(x => x.Value).Select(x => x.Key).First();
    9393
    9494        var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
    9595        var currentIntercept = 0.0;
    96         var currentCoeffs = DoRegression(pd, currVariables, variances, means, cMean, 1.0e-8, out currentIntercept);
     96        var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept);
    9797        var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
    9898        var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;
     
    115115    }
    116116
    117     private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double cmean, double ridge, out double intercept) {
    118       //if (pd.TrainingIndices.Count() > variables.Count) {
    119       //  var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable);
    120       //  pd2.TestPartition.End = pd.TestPartition.End;
    121       //  pd2.TestPartition.Start = pd.TestPartition.Start;
    122       //  pd2.TrainingPartition.End = pd.TrainingPartition.End;
    123       //  pd2.TrainingPartition.Start = pd.TrainingPartition.Start;
    124       //
    125       //  double x1, x2;
    126       //  var lm = PreconstructedLinearModel.CreateLinearModel(pd2, out x1, out x2);
    127       //  intercept = lm.Intercept;
    128       //  return lm.Coefficients;
     117    private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double ridge, out double intercept) {
    129118
    130119      var n = variables.Count;
     
    182171      if (coefficients == null) throw new ArgumentException("No linear model could be built");
    183172
    184       intercept = cmean;
     173      intercept = yMean;
    185174      var res = new Dictionary<string, double>();
    186175      for (var i = 0; i < n; i++) {
     
    193182    }
    194183
    195     private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double cVar, IRegressionProblemData pd, IList<string> variables) {
    196       var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / cVar))).Where(x => x.Value > 1.5).OrderBy(x => -x.Value).ToList();
     184    private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double yVariance, IRegressionProblemData pd, IList<string> variables) {
     185      var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance))).Where(x => x.Value > 1.5).OrderBy(x => -x.Value).ToList();
    197186      if (candidates.Count == 0) return variables;
    198187      var c = candidates.First().Key;
Note: See TracChangeset for help on using the changeset viewer.