Free cookie consent management tool by TermsFeed Policy Generator

source: branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/Spliting/M5Spliter.cs @ 15614

Last change on this file since 15614 was 15614, checked in by bwerth, 7 years ago

#2847 made changes to M5 according to review comments

File size: 4.6 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26using HeuristicLab.Core;
27using HeuristicLab.Data;
28using HeuristicLab.Parameters;
29using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
30using HeuristicLab.Problems.DataAnalysis;
31
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Split selector that evaluates, for every allowed input variable, all admissible cut positions
  /// of the data sorted by that variable and picks the one with the highest score reported by
  /// <see cref="OrderImpurityCalculator"/>.
  /// </summary>
  [StorableClass]
  [Item("M5Spliter", "A split selector that uses the ratio between Variances^1/Order to determine good splits")]
  public class M5Spliter : ParameterizedNamedItem, ISpliter {
    /// <summary>Name under which the order parameter is stored in the parameter collection.</summary>
    public const string OrderParameterName = "Order";

    /// <summary>The parameter holding the order (exponent) handed to the impurity calculator.</summary>
    public IFixedValueParameter<DoubleValue> OrderParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
    }

    /// <summary>Current numeric value of <see cref="OrderParameter"/>.</summary>
    public double Order {
      get { return OrderParameter.Value.Value; }
    }

    #region Constructors & Cloning
    [StorableConstructor]
    private M5Spliter(bool deserializing) { }
    private M5Spliter(M5Spliter original, Cloner cloner) : base(original, cloner) { }

    /// <summary>Creates a spliter whose order parameter defaults to 4.</summary>
    public M5Spliter() {
      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation sum (x_i - x_avg)^Order.", new DoubleValue(4)));
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new M5Spliter(this, cloner);
    }
    #endregion

    #region ISpliter
    /// <summary>
    /// Searches all allowed input variables of <paramref name="splitData"/> for the best split.
    /// </summary>
    /// <param name="splitData">Problem data whose dataset rows are to be partitioned.</param>
    /// <param name="minLeafSize">Minimum number of rows required on each side of a split; must be at least 1.</param>
    /// <param name="splitAttr">Receives the name of the chosen variable; only meaningful when the method returns true.</param>
    /// <param name="splitValue">Receives the threshold (midpoint between the two neighbouring sorted values); only meaningful when the method returns true.</param>
    /// <returns>True if a split leaving at least <paramref name="minLeafSize"/> rows on both sides was found; false if the node should become a leaf.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="minLeafSize"/> is smaller than 1.</exception>
    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
      // A leaf size below 1 would make AttributeSplit index outside of the sorted arrays.
      if (minLeafSize < 1) throw new ArgumentOutOfRangeException("minLeafSize", minLeafSize, "The minimal leaf size must be at least 1.");

      var bestPos = 0;
      var bestImpurity = double.MinValue;
      var bestSplitValue = 0.0;
      var bestSplitAttr = string.Empty;
      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;

      // Both sides need minLeafSize rows, so fewer than 2 * minLeafSize rows can never be split.
      // Written as a subtraction to avoid integer overflow for very large minLeafSize values.
      // This also protects AttributeSplit, which reads the element at index minLeafSize.
      if (splitData.Dataset.Rows - minLeafSize < minLeafSize) return false;

      //find best Attribute for the Spliter
      foreach (var attr in splitData.AllowedInputVariables) {
        int pos;
        double impurity, sValue;
        // pair every attribute value with its target value and sort the pairs ascending by attribute value
        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(splitData.Dataset.GetDoubleValues(splitData.TargetVariable), Tuple.Create).OrderBy(x => x.Item1).ToArray();
        AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out pos, out impurity, out sValue);
        // negated comparison: a NaN impurity never replaces the current best
        if (!(bestImpurity < impurity)) continue;
        bestImpurity = impurity;
        bestPos = pos;
        bestSplitValue = sValue;
        bestSplitAttr = attr;
      }

      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;
      //if no suitable split exists => leafNode (bestPos stays 0 when no attribute offered a valid cut)
      return bestPos >= minLeafSize && bestPos <= splitData.Dataset.Rows - minLeafSize;
    }

    /// <summary>
    /// Finds the best cut position for a single attribute.
    /// </summary>
    /// <remarks>
    /// Expects <paramref name="attValues"/> sorted ascending with <paramref name="targetValues"/> in the same order,
    /// <paramref name="minLeafSize"/> of at least 1 and at least 2 * <paramref name="minLeafSize"/> elements;
    /// <see cref="Split"/> guarantees these preconditions.
    /// </remarks>
    /// <param name="attValues">Sorted attribute values.</param>
    /// <param name="targetValues">Target values aligned with <paramref name="attValues"/>.</param>
    /// <param name="minLeafSize">Minimum number of elements on each side of the cut.</param>
    /// <param name="position">Receives the number of elements left of the best cut, or 0 if no cut is possible.</param>
    /// <param name="maxImpurity">Receives the score of the best cut, or -1E20 if no cut is possible.</param>
    /// <param name="splitValue">Receives the midpoint between the two attribute values adjacent to the best cut.</param>
    private void AttributeSplit(IReadOnlyList<double> attValues, IReadOnlyList<double> targetValues, int minLeafSize, out int position, out double maxImpurity, out double splitValue) {
      position = 0;
      maxImpurity = -1E20;
      splitValue = 0.0;
      var length = targetValues.Count;
      // NOTE(review): presumably the calculator starts with the first minLeafSize targets in the left part - confirm in OrderImpurityCalculator
      var imp = new OrderImpurityCalculator(minLeafSize, targetValues, Order);

      // first candidate: cut directly after the first minLeafSize elements
      if (imp.Impurity > maxImpurity && !attValues[minLeafSize - 1].IsAlmost(attValues[minLeafSize])) {
        maxImpurity = imp.Impurity;
        splitValue = (attValues[minLeafSize - 1] + attValues[minLeafSize]) / 2;
        position = minLeafSize;
      }

      // remaining candidates: hand element i to the calculator as a left-side increment, then consider the cut between i and i + 1
      for (var i = minLeafSize; i < length - minLeafSize; i++) {
        imp.Increment(targetValues[i], OrderImpurityCalculator.IncrementType.Left);
        if (attValues[i].IsAlmost(attValues[i + 1])) continue; //splits can not be made between two equal points
        if (imp.Impurity < maxImpurity) continue; // ties are resolved in favour of the later position
        maxImpurity = imp.Impurity;
        splitValue = (attValues[i] + attValues[i + 1]) / 2;
        position = i + 1;
      }
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.