Free cookie consent management tool by TermsFeed Policy Generator

source: stable/HeuristicLab.Algorithms.DataAnalysis.DecisionTrees/3.4/Splitting/SplitterBase.cs @ 17912

Last change on this file since 17912 was 17181, checked in by swagner, 5 years ago

#2875: Merged r17180 from trunk to stable

File size: 6.6 KB
RevLine 
[15830]1#region License Information
2/* HeuristicLab
[17181]3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[15830]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
[16847]29using HEAL.Attic;
[15830]30
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Abstract base class for splitters. Grows a <see cref="RegressionNodeTreeModel"/> level by level:
  /// pending nodes are kept in a FIFO queue together with their training rows, and for every node
  /// <c>DecideSplit</c> determines whether the node stays a leaf or is divided on an attribute/value pair.
  /// Derived classes supply the per-attribute split criterion via <c>AttributeSplit</c>.
  /// </summary>
  [StorableType("22DCCF28-8943-4622-BBD3-B2AB04F28C36")]
  [Item("SplitterBase", "Abstract base class for splitters")]
  public abstract class SplitterBase : ParameterizedNamedItem, ISplitter {
    /// <summary>Name of the scope variable under which the <see cref="SplittingState"/> is stored.</summary>
    public const string SplittingStateVariableName = "SplittingState";

    #region Constructors & Cloning
    // Deserialization constructor. The flag is forwarded to the base class so that the base type's
    // storable constructor runs instead of its regular default constructor (same pattern as
    // SplittingState further down in this class).
    [StorableConstructor]
    protected SplitterBase(StorableConstructorFlag _) : base(_) { }
    // Cloning constructor; all state lives in the base class.
    protected SplitterBase(SplitterBase original, Cloner cloner) : base(original, cloner) { }
    public SplitterBase() { }
    #endregion

    #region ISplitType
    /// <summary>
    /// Registers a fresh <see cref="SplittingState"/> in <paramref name="states"/> under
    /// <see cref="SplittingStateVariableName"/>.
    /// </summary>
    /// <param name="states">Scope that receives the new state variable.</param>
    public void Initialize(IScope states) {
      states.Variables.Add(new Variable(SplittingStateVariableName, new SplittingState()));
    }

    /// <summary>
    /// Splits the nodes of <paramref name="tree"/> until no pending node can be divided any further.
    /// Progress is kept in the <see cref="SplittingState"/> stored in <paramref name="stateScope"/>, so a call
    /// that was interrupted by cancellation can be continued by calling this method again with the same scope.
    /// </summary>
    /// <param name="tree">Tree whose root is used as the starting node on the first call.</param>
    /// <param name="trainingRows">Training rows assigned to the root on the first call.</param>
    /// <param name="stateScope">
    /// Scope holding the <see cref="RegressionTreeParameters"/> (variable
    /// <c>DecisionTreeRegression.RegressionTreeParameterVariableName</c>) and the splitting state
    /// (variable <see cref="SplittingStateVariableName"/>).
    /// </param>
    /// <param name="cancellationToken">Checked once per processed node.</param>
    /// <exception cref="OperationCanceledException">Cancellation was requested while nodes were still pending.</exception>
    public void Split(RegressionNodeTreeModel tree, IReadOnlyList<int> trainingRows, IScope stateScope, CancellationToken cancellationToken) {
      var regressionTreeParams = (RegressionTreeParameters)stateScope.Variables[DecisionTreeRegression.RegressionTreeParameterVariableName].Value;
      var splittingState = (SplittingState)stateScope.Variables[SplittingStateVariableName].Value;
      var variables = regressionTreeParams.AllowedInputVariables.ToArray();
      var target = regressionTreeParams.TargetVariable;

      // Code <= 0 means nothing has been done yet: seed the work queues with the root exactly once.
      // On a resumed call (Code == 1) the queues already contain the pending nodes.
      if (splittingState.Code <= 0) {
        splittingState.nodeQueue.Enqueue(tree.Root);
        splittingState.trainingRowsQueue.Enqueue(trainingRows);
        splittingState.Code = 1;
      }
      while (splittingState.nodeQueue.Count != 0) {
        // Check before anything is dequeued: both queues are still in sync at this point, so throwing here
        // leaves a state that can be resumed. Checking at the top also covers iterations that end in a leaf.
        cancellationToken.ThrowIfCancellationRequested();

        // The two queues are filled in lockstep, so the dequeued rows belong to the dequeued node.
        var n = splittingState.nodeQueue.Dequeue();
        var rows = splittingState.trainingRowsQueue.Dequeue();

        string attr;
        double splitValue;
        // Evaluate split candidates on a dataset reduced to this node's rows.
        var isLeaf = !DecideSplit(new RegressionProblemData(RegressionTreeUtilities.ReduceDataset(regressionTreeParams.Data, rows, variables, target), variables, target), regressionTreeParams.MinLeafSize, out attr, out splitValue);
        if (isLeaf) continue;

        IReadOnlyList<int> leftRows, rightRows;
        RegressionTreeUtilities.SplitRows(rows, regressionTreeParams.Data, attr, splitValue, out leftRows, out rightRows);
        n.Split(regressionTreeParams, attr, splitValue, rows.Count);

        // Schedule both children; nodes and rows are enqueued in the same order (left, right).
        splittingState.nodeQueue.Enqueue(n.Left);
        splittingState.nodeQueue.Enqueue(n.Right);
        splittingState.trainingRowsQueue.Enqueue(leftRows);
        splittingState.trainingRowsQueue.Enqueue(rightRows);
      }
    }

    /// <summary>
    /// Evaluates every allowed input variable of <paramref name="splitData"/> with <c>AttributeSplit</c> and
    /// selects the candidate with the largest impurity value.
    /// </summary>
    /// <param name="splitData">Problem data restricted to the rows of the node under consideration.</param>
    /// <param name="minLeafSize">Minimum leaf size used for the row-count guard and the final position check.</param>
    /// <param name="splitAttr">Name of the selected attribute; the empty string if no candidate was selected.</param>
    /// <param name="splitValue">Split value of the selected candidate; 0.0 if no candidate was selected.</param>
    /// <returns><c>true</c> if the node should be split; <c>false</c> if it should remain a leaf.</returns>
    protected virtual bool DecideSplit(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
      var bestPos = -1;
      var bestImpurity = double.MinValue;
      var bestSplitValue = 0.0;
      var bestSplitAttr = string.Empty;
      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;
      // Fewer rows than the minimum leaf size: no candidate is evaluated at all.
      if (splitData.Dataset.Rows < minLeafSize) return false;

      // find best attribute for the splitter
      foreach (var attr in splitData.AllowedInputVariables) {
        int pos;
        double impurity, sValue;
        // Pair each attribute value with its target value and sort the pairs by attribute value.
        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(splitData.Dataset.GetDoubleValues(splitData.TargetVariable), Tuple.Create).OrderBy(x => x.Item1).ToArray();
        AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out pos, out impurity, out sValue);
        // Written as a negated '<' so that a NaN impurity never replaces the current best.
        if (!(bestImpurity < impurity)) continue;
        bestImpurity = impurity;
        bestPos = pos;
        bestSplitValue = sValue;
        bestSplitAttr = attr;
      }

      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;
      //if no suitable split exists => leafNode
      // NOTE(review): the meaning of the position is defined by the AttributeSplit implementations, which are
      // not visible here; this check presumably treats it as an index into the sorted rows - confirm the bounds.
      return bestPos + 1 >= minLeafSize && bestPos <= splitData.Dataset.Rows - minLeafSize;
    }

    /// <summary>
    /// Computes the split candidate for a single attribute.
    /// </summary>
    /// <param name="attValues">Attribute values, sorted ascending by the caller.</param>
    /// <param name="targetValues">Target values in the same order as <paramref name="attValues"/>.</param>
    /// <param name="minLeafSize">Minimum leaf size passed through from <c>DecideSplit</c>.</param>
    /// <param name="position">Position of the candidate split; checked against the leaf-size bounds by <c>DecideSplit</c>.</param>
    /// <param name="maxImpurity">Score of the candidate; <c>DecideSplit</c> keeps the attribute with the largest value.</param>
    /// <param name="splitValue">Threshold value of the candidate split.</param>
    protected abstract void AttributeSplit(IReadOnlyList<double> attValues, IReadOnlyList<double> targetValues, int minLeafSize, out int position, out double maxImpurity, out double splitValue);
    #endregion

    /// <summary>
    /// Persistable progress of a <c>SplitterBase.Split</c> run: the nodes that still have to be examined and
    /// the training rows that belong to each of them.
    /// </summary>
    [StorableType("BC1149FD-370E-4F3A-92F5-6E519736D09A")]
    public class SplittingState : Item {
      // Nodes waiting to be examined; filled and drained in lockstep with trainingRowsQueue.
      public Queue<RegressionNodeModel> nodeQueue;
      // Storable view of nodeQueue: the queue is persisted as an array and rebuilt from it on load.
      [Storable]
      private RegressionNodeModel[] storableNodeQueue {
        get { return nodeQueue.ToArray(); }
        set { nodeQueue = new Queue<RegressionNodeModel>(value); }
      }

      // Training rows of the pending nodes, in the same order as nodeQueue.
      public Queue<IReadOnlyList<int>> trainingRowsQueue;
      // Storable view of trainingRowsQueue: persisted as an array and rebuilt from it on load.
      [Storable]
      private IReadOnlyList<int>[] storableTrainingRowsQueue {
        get { return trainingRowsQueue.ToArray(); }
        set { trainingRowsQueue = new Queue<IReadOnlyList<int>>(value); }
      }


      //State.Code values denote the current action (for pausing)
      //0...nothing has been done;
      //1...splitting nodes;
      [Storable]
      public int Code = 0;

      #region HLConstructors & Cloning
      // Deserialization constructor; the queues are restored through the storable properties above.
      [StorableConstructor]
      protected SplittingState(StorableConstructorFlag _) : base(_) { }
      // Cloning constructor: nodes are cloned through the cloner, row lists are copied into new arrays.
      protected SplittingState(SplittingState original, Cloner cloner) : base(original, cloner) {
        nodeQueue = new Queue<RegressionNodeModel>(original.nodeQueue.Select(cloner.Clone));
        trainingRowsQueue = new Queue<IReadOnlyList<int>>(original.trainingRowsQueue.Select(x => (IReadOnlyList<int>)x.ToArray()));
        Code = original.Code;
      }
      // Creates an empty state (Code == 0, both queues empty).
      public SplittingState() : base() {
        nodeQueue = new Queue<RegressionNodeModel>();
        trainingRowsQueue = new Queue<IReadOnlyList<int>>();
      }
      public override IDeepCloneable Clone(Cloner cloner) {
        return new SplittingState(this, cloner);
      }
      #endregion
    }
  }
}
Note: See TracBrowser for help on using the repository browser.