source:
branches/2847_M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/m5.patch
@
16848
Last change on this file since 16848 was 15830, checked in by bwerth, 6 years ago | |
---|---|
File size: 108.4 KB |
-
Interfaces/IM5Model.cs
24 24 using HeuristicLab.Problems.DataAnalysis; 25 25 26 26 namespace HeuristicLab.Algorithms.DataAnalysis { 27 internal interface IM5Model : IRegressionModel { 27 public interface IM5Model : IRegressionModel { 28 28 void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken); 29 29 void Update(IReadOnlyList<int> rows, M5Parameters m5Parameters, CancellationToken cancellationToken); 30 30 } -
Interfaces/IPruning.cs
19 19 */ 20 20 #endregion 21 21 using System.Collections.Generic; 22 using System.Threading; 22 23 using HeuristicLab.Core; 23 24 using HeuristicLab.Problems.DataAnalysis; 24 25 … … 25 26 namespace HeuristicLab.Algorithms.DataAnalysis { 26 27 public interface IPruning : IParameterizedNamedItem { 27 28 int MinLeafSize(IRegressionProblemData pd, ILeafModel leafModel); 29 30 void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken); 28 31 } 29 32 } 33 No newline at end of file -
Interfaces/ISplitter.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
using HeuristicLab.Core;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Strategy that decides where a tree node is split.
  /// </summary>
  public interface ISplitter : IParameterizedNamedItem {
    /// <summary>
    /// Decides whether a node should be split and, if so,
    /// on which attribute and at which value.
    /// </summary>
    /// <param name="splitData">The regression problem data the decision is based on (presumably the rows that reached the node - confirm with implementations).</param>
    /// <param name="minLeafSize">Minimum leaf size to respect (presumably the smallest number of rows each side of the split may hold - confirm with implementations).</param>
    /// <param name="splitAttr">Receives the attribute to split on.</param>
    /// <param name="splitValue">Receives the value at which to split.</param>
    /// <returns>Whether the node should be split; the out parameters describe the split in that case.</returns>
    bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue);
  }
}
LeafModels/ComponentReducedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Regression model that passes the requested rows through a principal component
  /// transformation and delegates estimation to an inner model working on the transformed data.
  /// </summary>
  [StorableClass]
  public class ComponentReducedLinearModel : RegressionModel, IConfidenceRegressionModel {
    // Inner model that is evaluated on the transformed dataset.
    [Storable]
    private IConfidenceRegressionModel Model;
    // Transformation applied to the reduced dataset before the inner model is evaluated.
    [Storable]
    private PrincipleComponentTransformation Pca;

    [StorableConstructor]
    private ComponentReducedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: both members are cloned through the cloner.
    private ComponentReducedLinearModel(ComponentReducedLinearModel original, Cloner cloner) : base(original, cloner) {
      Model = cloner.Clone(original.Model);
      Pca = cloner.Clone(original.Pca);
    }

    /// <summary>
    /// Creates the wrapper from an inner model and the transformation whose output it consumes.
    /// </summary>
    public ComponentReducedLinearModel(string targetVariable, IConfidenceRegressionModel model, PrincipleComponentTransformation pca) : base(targetVariable) {
      Model = model;
      Pca = pca;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ComponentReducedLinearModel(this, cloner);
    }

    /// <summary>The variables reported by the inner model.</summary>
    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Model.VariablesUsedForPrediction; }
    }

    /// <summary>
    /// Estimates the target for the given rows by evaluating the inner model on the transformed copy of those rows.
    /// </summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      var selected = ReduceDataset(dataset, rows.ToArray());
      var transformed = Pca.TransformDataset(selected);
      // The reduced dataset holds only the requested rows, so they are addressed as 0..Rows-1.
      var localRows = Enumerable.Range(0, selected.Rows);
      return Model.GetEstimatedValues(transformed, localRows);
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new ConfidenceRegressionSolution(this, problemData);
    }

    /// <summary>
    /// Returns the inner model's variance estimates for the given rows, evaluated on the transformed copy of those rows.
    /// </summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      var selected = ReduceDataset(dataset, rows.ToArray());
      var transformed = Pca.TransformDataset(selected);
      var localRows = Enumerable.Range(0, selected.Rows);
      return Model.GetEstimatedVariances(transformed, localRows);
    }

    // Builds a new dataset that contains every double variable of the source, restricted to the given rows.
    private IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows) {
      var columns = data.DoubleVariables.Select(name => data.GetDoubleValues(name, rows).ToList());
      return new Dataset(data.DoubleVariables, columns);
    }
  }
}
LeafModels/DampenedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  // multidimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
  /// <summary>
  /// Wraps a confidence regression model and passes each of its estimates through a sigmoid
  /// that is scaled by the minimum and maximum of the training target values.
  /// </summary>
  [StorableClass]
  public class DampenedLinearModel : RegressionModel, IConfidenceRegressionModel {
    // Inner model producing the raw estimates and the variances.
    [Storable]
    private IConfidenceRegressionModel Model;
    // Smallest training target value seen at construction.
    [Storable]
    private double Min;
    // Largest training target value seen at construction.
    [Storable]
    private double Max;
    // Half-width of the interval the raw estimates are mapped to before the sigmoid is applied.
    [Storable]
    private double Dampening;

    [StorableConstructor]
    private DampenedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: the inner model is cloned, the scalars are copied.
    private DampenedLinearModel(DampenedLinearModel original, Cloner cloner) : base(original, cloner) {
      Model = cloner.Clone(original.Model);
      Min = original.Min;
      Max = original.Max;
      Dampening = original.Dampening;
    }

    /// <summary>
    /// Creates the wrapper; the target range is read from the training values of <paramref name="pd"/>.
    /// </summary>
    public DampenedLinearModel(IConfidenceRegressionModel model, IRegressionProblemData pd, double dampening) : base(model.TargetVariable) {
      Model = model;
      Min = pd.TargetVariableTrainingValues.Min();
      Max = pd.TargetVariableTrainingValues.Max();
      Dampening = dampening;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new DampenedLinearModel(this, cloner);
    }

    /// <summary>The variables reported by the inner model.</summary>
    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Model.VariablesUsedForPrediction; }
    }

    /// <summary>
    /// Lazily yields the dampened estimates: each raw estimate is mapped from [Min, Max] to
    /// [-Dampening, Dampening], passed through the sigmoid, and mapped back to [Min, Max].
    /// </summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      var lowerSigmoid = Sigmoid(-Dampening);
      var upperSigmoid = Sigmoid(Dampening);
      foreach (var estimate in Model.GetEstimatedValues(dataset, rows)) {
        var squashed = Sigmoid(Rescale(estimate, Min, Max, -Dampening, Dampening));
        yield return Rescale(squashed, lowerSigmoid, upperSigmoid, Min, Max);
      }
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new ConfidenceRegressionSolution(this, problemData);
    }

    /// <summary>Returns the inner model's variances unchanged.</summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      return Model.GetEstimatedVariances(dataset, rows);
    }

    // Linearly maps x from [oMin, oMax] to [nMin, nMax].
    // For an (almost) empty source interval the result is the midpoint of the target interval.
    private static double Rescale(double x, double oMin, double oMax, double nMin, double nMax) {
      var sourceSpan = oMax - oMin;
      var targetSpan = nMax - nMin;
      var offset = nMin;
      if (sourceSpan.IsAlmost(0)) {
        offset = nMin + targetSpan / 2;
        sourceSpan = 1;
        // The product below is kept (instead of returning early) so that a NaN or infinite x still yields NaN.
        targetSpan = 0;
      }
      return ((x - oMin) / sourceSpan) * targetSpan + offset;
    }

    // Logistic function 1 / (1 + e^-x).
    private static double Sigmoid(double x) {
      return 1 / (1 + Math.Exp(-x));
    }
  }
}
LeafModels/PreconstructedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  // multidimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
  /// <summary>
  /// Linear model given by one coefficient per input variable plus an intercept.
  /// It can additionally report a per-row variance when residual variance and sample size are known.
  /// </summary>
  [StorableClass]
  internal sealed class PreconstructedLinearModel : RegressionModel, IConfidenceRegressionModel {
    [Storable]
    public Dictionary<string, double> Coefficients { get; private set; }
    [Storable]
    public double Intercept { get; private set; }
    [Storable]
    private Dictionary<string, double> Means { get; set; }
    [Storable]
    private Dictionary<string, double> Variances { get; set; }
    [Storable]
    private double ResidualVariance { get; set; }
    [Storable]
    private int SampleSize { get; set; }

    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Coefficients.Keys; }
    }

    #region HLConstructors
    [StorableConstructor]
    private PreconstructedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: dictionaries are copied so that the clone does not share state with the original.
    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
      if (original.Coefficients != null) Coefficients = original.Coefficients.ToDictionary(x => x.Key, x => x.Value);
      Intercept = original.Intercept;
      if (original.Means != null) Means = original.Means.ToDictionary(x => x.Key, x => x.Value);
      if (original.Variances != null) Variances = original.Variances.ToDictionary(x => x.Key, x => x.Value);
      ResidualVariance = original.ResidualVariance;
      SampleSize = original.SampleSize;
    }

    /// <summary>
    /// Creates a model from explicit statistics and coefficients.
    /// Residual variance and sample size start at 0, so estimated variances are 0 until they are set.
    /// </summary>
    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = coefficients;
      Intercept = intercept;
      Variances = variances;
      Means = means;
      ResidualVariance = 0;
      SampleSize = 0;
    }

    /// <summary>
    /// Creates a constant model that always predicts <paramref name="intercept"/>.
    /// </summary>
    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = new Dictionary<string, double>();
      Intercept = intercept;
      // Means is initialized as well so that no member of a constructed instance is null.
      Means = new Dictionary<string, double>();
      Variances = new Dictionary<string, double>();
      ResidualVariance = 0;
      SampleSize = 0;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreconstructedLinearModel(this, cloner);
    }
    #endregion

    /// <summary>
    /// Fits a linear model on the training rows of <paramref name="pd"/>.
    /// </summary>
    /// <param name="pd">Problem data supplying input variables, target and training rows.</param>
    /// <param name="rmse">Always set to NaN by this method.</param>
    /// <param name="cvRmse">Always set to NaN by this method.</param>
    /// <returns>The fitted model.</returns>
    public static PreconstructedLinearModel CreateConfidenceLinearModel(IRegressionProblemData pd, out double rmse, out double cvRmse) {
      rmse = double.NaN;
      cvRmse = double.NaN;
      return AlternativeCalculation(pd);
    }

    // Fallback fit via alglib.lrbuild. Also fills ResidualVariance and SampleSize so that variances can be estimated.
    // Throws ArgumentException when lrbuild does not report success.
    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd, out double rmse, out double cvRmse) {
      // Fit on the training rows only, like AlternativeCalculation does; the residuals and the
      // sample size below refer to the same rows.
      var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
        pd.TargetVariable
      }), pd.TrainingIndices);

      var nFeatures = inputMatrix.GetLength(1) - 1;
      double[] coefficients;

      alglib.linearmodel lm;
      alglib.lrreport ar;
      int retVal;
      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
      rmse = ar.rmserror;
      cvRmse = ar.cvrmserror;

      alglib.lrunpack(lm, out coefficients, out nFeatures);

      // NOTE(review): means and variances are taken over all rows of the dataset, not only the training rows - confirm this is intended.
      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Average());
      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Variance());
      var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new { s, d }).ToDictionary(x => x.s, x => x.d);
      var res = new PreconstructedLinearModel(means, variances, coeffs, coefficients[nFeatures], pd.TargetVariable);

      // Targets and estimates must cover the same rows, otherwise Zip pairs unrelated values.
      res.ResidualVariance = pd.TargetVariableTrainingValues.Zip(res.GetEstimatedValues(pd.Dataset, pd.TrainingIndices), (x, y) => x - y).Variance();
      res.SampleSize = pd.TrainingIndices.Count();
      return res;
    }

    // Primary fit: solves the normal equations (X^T X) b = X^T y through a Cholesky factorization.
    // Falls back to ClassicCalculation when the factorization or the solve fails.
    // The returned model keeps SampleSize == 0, so its estimated variances are 0.
    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd) {
      var means = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Average());
      var variances = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Variance());
      var variables = pd.AllowedInputVariables.ToList();
      var n = variables.Count;
      var m = pd.TrainingIndices.Count();

      //Set up X^T and y
      var inTr = new double[n + 1, m];
      for (var i = 0; i < n; i++) {
        var v = variables[i];
        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
      }

      // last row of X^T is constant 1 and yields the intercept
      for (var i = 0; i < m; i++) inTr[n, i] = 1;

      var y = new double[m, 1];
      var ydata = pd.TargetVariableTrainingValues.ToArray();
      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];

      //Perform linear regression
      var aTy = new double[n + 1, 1];
      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y
      var aTa = new double[n + 1, n + 1];
      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr)

      double rmse, cvrmse;
      // When the factorization fails the matrix is not usable for the solve below, so switch to the fallback right away.
      if (!alglib.spdmatrixcholesky(ref aTa, n + 1, true)) return ClassicCalculation(pd, out rmse, out cvrmse);

      int info;
      alglib.densesolverreport report;
      double[] coefficients;
      var aTyVector = new double[n + 1];
      for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];
      alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
      if (info != 1) return ClassicCalculation(pd, out rmse, out cvrmse);

      //extract coefficients
      var intercept = coefficients[n];
      var coeffs = new Dictionary<string, double>();
      for (var i = 0; i < n; i++) coeffs.Add(variables[i], coefficients[i]);

      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
    }

    /// <summary>Lazily evaluates the linear model for every given row.</summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      return rows.Select(row => GetEstimatedValue(dataset, row));
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new RegressionSolution(this, problemData);
    }

    /// <summary>Lazily evaluates the variance estimate for every given row.</summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      return rows.Select(i => GetEstimatedVariance(dataset, i));
    }

    #region helpers
    // Intercept plus the coefficient-weighted sum of the row's input values.
    private double GetEstimatedValue(IDataset dataset, int row) {
      return Intercept + (Coefficients.Count == 0 ? 0 : Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row)));
    }

    // ResidualVariance * (1 / SampleSize + sum_k (x_k - mean_k)^2 / variance_k / (SampleSize - 1)).
    // Returns 0 when no sample size is known or when the result is not a finite number.
    private double GetEstimatedVariance(IDataset dataset, int row) {
      if (SampleSize == 0) return 0.0;
      var sum = (from entry in Variances let d = dataset.GetDoubleValue(entry.Key, row) - Means[entry.Key] select d * d / entry.Value).Sum();
      var res = ResidualVariance * (1.0 / SampleSize + sum / (SampleSize - 1));
      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
      return res;
    }
    #endregion
  }
}
LeafModels/ComponentReducedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Regression model that passes the requested rows through a principal component
  /// transformation and delegates estimation to an inner model working on the transformed data.
  /// </summary>
  [StorableClass]
  public class ComponentReducedLinearModel : RegressionModel, IConfidenceRegressionModel {
    // Inner model that is evaluated on the transformed dataset.
    [Storable]
    private IConfidenceRegressionModel Model;
    // Transformation applied to the reduced dataset before the inner model is evaluated.
    [Storable]
    private PrincipleComponentTransformation Pca;

    [StorableConstructor]
    private ComponentReducedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: both members are cloned through the cloner.
    private ComponentReducedLinearModel(ComponentReducedLinearModel original, Cloner cloner) : base(original, cloner) {
      Model = cloner.Clone(original.Model);
      Pca = cloner.Clone(original.Pca);
    }

    /// <summary>
    /// Creates the wrapper from an inner model and the transformation whose output it consumes.
    /// </summary>
    public ComponentReducedLinearModel(string targetVariable, IConfidenceRegressionModel model, PrincipleComponentTransformation pca) : base(targetVariable) {
      Model = model;
      Pca = pca;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ComponentReducedLinearModel(this, cloner);
    }

    /// <summary>The variables reported by the inner model.</summary>
    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Model.VariablesUsedForPrediction; }
    }

    /// <summary>
    /// Estimates the target for the given rows by evaluating the inner model on the transformed copy of those rows.
    /// </summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      var selected = ReduceDataset(dataset, rows.ToArray());
      var transformed = Pca.TransformDataset(selected);
      // The reduced dataset holds only the requested rows, so they are addressed as 0..Rows-1.
      var localRows = Enumerable.Range(0, selected.Rows);
      return Model.GetEstimatedValues(transformed, localRows);
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new ConfidenceRegressionSolution(this, problemData);
    }

    /// <summary>
    /// Returns the inner model's variance estimates for the given rows, evaluated on the transformed copy of those rows.
    /// </summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      var selected = ReduceDataset(dataset, rows.ToArray());
      var transformed = Pca.TransformDataset(selected);
      var localRows = Enumerable.Range(0, selected.Rows);
      return Model.GetEstimatedVariances(transformed, localRows);
    }

    // Builds a new dataset that contains every double variable of the source, restricted to the given rows.
    private IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows) {
      var columns = data.DoubleVariables.Select(name => data.GetDoubleValues(name, rows).ToList());
      return new Dataset(data.DoubleVariables, columns);
    }
  }
}
LeafModels/DampenedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  // multidimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
  /// <summary>
  /// Wraps a confidence regression model and passes each of its estimates through a sigmoid
  /// that is scaled by the minimum and maximum of the training target values.
  /// </summary>
  [StorableClass]
  public class DampenedLinearModel : RegressionModel, IConfidenceRegressionModel {
    // Inner model producing the raw estimates and the variances.
    [Storable]
    private IConfidenceRegressionModel Model;
    // Smallest training target value seen at construction.
    [Storable]
    private double Min;
    // Largest training target value seen at construction.
    [Storable]
    private double Max;
    // Half-width of the interval the raw estimates are mapped to before the sigmoid is applied.
    [Storable]
    private double Dampening;

    [StorableConstructor]
    private DampenedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: the inner model is cloned, the scalars are copied.
    private DampenedLinearModel(DampenedLinearModel original, Cloner cloner) : base(original, cloner) {
      Model = cloner.Clone(original.Model);
      Min = original.Min;
      Max = original.Max;
      Dampening = original.Dampening;
    }

    /// <summary>
    /// Creates the wrapper; the target range is read from the training values of <paramref name="pd"/>.
    /// </summary>
    public DampenedLinearModel(IConfidenceRegressionModel model, IRegressionProblemData pd, double dampening) : base(model.TargetVariable) {
      Model = model;
      Min = pd.TargetVariableTrainingValues.Min();
      Max = pd.TargetVariableTrainingValues.Max();
      Dampening = dampening;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new DampenedLinearModel(this, cloner);
    }

    /// <summary>The variables reported by the inner model.</summary>
    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Model.VariablesUsedForPrediction; }
    }

    /// <summary>
    /// Lazily yields the dampened estimates: each raw estimate is mapped from [Min, Max] to
    /// [-Dampening, Dampening], passed through the sigmoid, and mapped back to [Min, Max].
    /// </summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      var lowerSigmoid = Sigmoid(-Dampening);
      var upperSigmoid = Sigmoid(Dampening);
      foreach (var estimate in Model.GetEstimatedValues(dataset, rows)) {
        var squashed = Sigmoid(Rescale(estimate, Min, Max, -Dampening, Dampening));
        yield return Rescale(squashed, lowerSigmoid, upperSigmoid, Min, Max);
      }
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new ConfidenceRegressionSolution(this, problemData);
    }

    /// <summary>Returns the inner model's variances unchanged.</summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      return Model.GetEstimatedVariances(dataset, rows);
    }

    // Linearly maps x from [oMin, oMax] to [nMin, nMax].
    // For an (almost) empty source interval the result is the midpoint of the target interval.
    private static double Rescale(double x, double oMin, double oMax, double nMin, double nMax) {
      var sourceSpan = oMax - oMin;
      var targetSpan = nMax - nMin;
      var offset = nMin;
      if (sourceSpan.IsAlmost(0)) {
        offset = nMin + targetSpan / 2;
        sourceSpan = 1;
        // The product below is kept (instead of returning early) so that a NaN or infinite x still yields NaN.
        targetSpan = 0;
      }
      return ((x - oMin) / sourceSpan) * targetSpan + offset;
    }

    // Logistic function 1 / (1 + e^-x).
    private static double Sigmoid(double x) {
      return 1 / (1 + Math.Exp(-x));
    }
  }
}
LeafModels/PreconstructedLinearModel.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  // multidimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
  /// <summary>
  /// Linear model given by one coefficient per input variable plus an intercept.
  /// It can additionally report a per-row variance when residual variance and sample size are known.
  /// </summary>
  [StorableClass]
  internal sealed class PreconstructedLinearModel : RegressionModel, IConfidenceRegressionModel {
    [Storable]
    public Dictionary<string, double> Coefficients { get; private set; }
    [Storable]
    public double Intercept { get; private set; }
    [Storable]
    private Dictionary<string, double> Means { get; set; }
    [Storable]
    private Dictionary<string, double> Variances { get; set; }
    [Storable]
    private double ResidualVariance { get; set; }
    [Storable]
    private int SampleSize { get; set; }

    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Coefficients.Keys; }
    }

    #region HLConstructors
    [StorableConstructor]
    private PreconstructedLinearModel(bool deserializing) : base(deserializing) { }

    // Cloning constructor: dictionaries are copied so that the clone does not share state with the original.
    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
      if (original.Coefficients != null) Coefficients = original.Coefficients.ToDictionary(x => x.Key, x => x.Value);
      Intercept = original.Intercept;
      if (original.Means != null) Means = original.Means.ToDictionary(x => x.Key, x => x.Value);
      if (original.Variances != null) Variances = original.Variances.ToDictionary(x => x.Key, x => x.Value);
      ResidualVariance = original.ResidualVariance;
      SampleSize = original.SampleSize;
    }

    /// <summary>
    /// Creates a model from explicit statistics and coefficients.
    /// Residual variance and sample size start at 0, so estimated variances are 0 until they are set.
    /// </summary>
    public PreconstructedLinearModel(Dictionary<string, double> means, Dictionary<string, double> variances, Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = coefficients;
      Intercept = intercept;
      Variances = variances;
      Means = means;
      ResidualVariance = 0;
      SampleSize = 0;
    }

    /// <summary>
    /// Creates a constant model that always predicts <paramref name="intercept"/>.
    /// </summary>
    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = new Dictionary<string, double>();
      Intercept = intercept;
      // Means is initialized as well so that no member of a constructed instance is null.
      Means = new Dictionary<string, double>();
      Variances = new Dictionary<string, double>();
      ResidualVariance = 0;
      SampleSize = 0;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreconstructedLinearModel(this, cloner);
    }
    #endregion

    /// <summary>
    /// Fits a linear model on the training rows of <paramref name="pd"/>.
    /// </summary>
    /// <param name="pd">Problem data supplying input variables, target and training rows.</param>
    /// <param name="rmse">Always set to NaN by this method.</param>
    /// <param name="cvRmse">Always set to NaN by this method.</param>
    /// <returns>The fitted model.</returns>
    public static PreconstructedLinearModel CreateConfidenceLinearModel(IRegressionProblemData pd, out double rmse, out double cvRmse) {
      rmse = double.NaN;
      cvRmse = double.NaN;
      return AlternativeCalculation(pd);
    }

    // Fallback fit via alglib.lrbuild. Also fills ResidualVariance and SampleSize so that variances can be estimated.
    // Throws ArgumentException when lrbuild does not report success.
    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd, out double rmse, out double cvRmse) {
      // Fit on the training rows only, like AlternativeCalculation does; the residuals and the
      // sample size below refer to the same rows.
      var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
        pd.TargetVariable
      }), pd.TrainingIndices);

      var nFeatures = inputMatrix.GetLength(1) - 1;
      double[] coefficients;

      alglib.linearmodel lm;
      alglib.lrreport ar;
      int retVal;
      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
      rmse = ar.rmserror;
      cvRmse = ar.cvrmserror;

      alglib.lrunpack(lm, out coefficients, out nFeatures);

      // NOTE(review): means and variances are taken over all rows of the dataset, not only the training rows - confirm this is intended.
      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Average());
      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n).Variance());
      var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new { s, d }).ToDictionary(x => x.s, x => x.d);
      var res = new PreconstructedLinearModel(means, variances, coeffs, coefficients[nFeatures], pd.TargetVariable);

      // Targets and estimates must cover the same rows, otherwise Zip pairs unrelated values.
      res.ResidualVariance = pd.TargetVariableTrainingValues.Zip(res.GetEstimatedValues(pd.Dataset, pd.TrainingIndices), (x, y) => x - y).Variance();
      res.SampleSize = pd.TrainingIndices.Count();
      return res;
    }

    // Primary fit: solves the normal equations (X^T X) b = X^T y through a Cholesky factorization.
    // Falls back to ClassicCalculation when the factorization or the solve fails.
    // The returned model keeps SampleSize == 0, so its estimated variances are 0.
    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd) {
      var means = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Average());
      var variances = pd.AllowedInputVariables.ToDictionary(n1 => n1, n1 => pd.Dataset.GetDoubleValues(n1).Variance());
      var variables = pd.AllowedInputVariables.ToList();
      var n = variables.Count;
      var m = pd.TrainingIndices.Count();

      //Set up X^T and y
      var inTr = new double[n + 1, m];
      for (var i = 0; i < n; i++) {
        var v = variables[i];
        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
      }

      // last row of X^T is constant 1 and yields the intercept
      for (var i = 0; i < m; i++) inTr[n, i] = 1;

      var y = new double[m, 1];
      var ydata = pd.TargetVariableTrainingValues.ToArray();
      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];

      //Perform linear regression
      var aTy = new double[n + 1, 1];
      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y
      var aTa = new double[n + 1, n + 1];
      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr)

      double rmse, cvrmse;
      // When the factorization fails the matrix is not usable for the solve below, so switch to the fallback right away.
      if (!alglib.spdmatrixcholesky(ref aTa, n + 1, true)) return ClassicCalculation(pd, out rmse, out cvrmse);

      int info;
      alglib.densesolverreport report;
      double[] coefficients;
      var aTyVector = new double[n + 1];
      for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];
      alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
      if (info != 1) return ClassicCalculation(pd, out rmse, out cvrmse);

      //extract coefficients
      var intercept = coefficients[n];
      var coeffs = new Dictionary<string, double>();
      for (var i = 0; i < n; i++) coeffs.Add(variables[i], coefficients[i]);

      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
    }

    /// <summary>Lazily evaluates the linear model for every given row.</summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      return rows.Select(row => GetEstimatedValue(dataset, row));
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new RegressionSolution(this, problemData);
    }

    /// <summary>Lazily evaluates the variance estimate for every given row.</summary>
    public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) {
      return rows.Select(i => GetEstimatedVariance(dataset, i));
    }

    #region helpers
    // Intercept plus the coefficient-weighted sum of the row's input values.
    private double GetEstimatedValue(IDataset dataset, int row) {
      return Intercept + (Coefficients.Count == 0 ? 0 : Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row)));
    }

    // ResidualVariance * (1 / SampleSize + sum_k (x_k - mean_k)^2 / variance_k / (SampleSize - 1)).
    // Returns 0 when no sample size is known or when the result is not a finite number.
    private double GetEstimatedVariance(IDataset dataset, int row) {
      if (SampleSize == 0) return 0.0;
      var sum = (from entry in Variances let d = dataset.GetDoubleValue(entry.Key, row) - Means[entry.Key] select d * d / entry.Value).Sum();
      var res = ResidualVariance * (1.0 / SampleSize + sum / (SampleSize - 1));
      if (double.IsInfinity(res) || double.IsNaN(res)) return 0.0;
      return res;
    }
    #endregion
  }
}
LeafTypes/ComplexLeaf.cs
66 66 if (t == null) throw new ArgumentException("No RegressionSolution was provided by the algorithm"); 67 67 return t.Model; 68 68 } 69 70 69 public int MinLeafSize(IRegressionProblemData pd) { 71 70 return 3; 72 71 } -
LeafTypes/LinearLeaf.cs
49 49 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 50 50 double rmse, cvRmse; 51 51 noParameters = pd.AllowedInputVariables.Count() + 1; 52 return PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse); 52 var res = PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse); 53 return res; 53 54 } 54 55 55 56 public int MinLeafSize(IRegressionProblemData pd) { 56 return pd.AllowedInputVariables.Count() + 2;57 return pd.AllowedInputVariables.Count() == 1 ? 2 : pd.AllowedInputVariables.Count() + 2; 57 58 } 58 59 #endregion 59 60 } -
LeafTypes/LogisticLeaf.cs
58 58 get { return true; } 59 59 } 60 60 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 61 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 62 double rmse, cvRmse; 63 noParameters = pd.AllowedInputVariables.Count() + 1; 64 return new DampenedLinearModel(PreconstructedLinearModel.CreateConfidenceLinearModel(pd, out rmse, out cvRmse), pd, Dampening); 61 var res = (IConfidenceRegressionModel)new LinearLeaf().Build(pd, random, cancellationToken, out noParameters); 62 return new DampenedLinearModel(res, pd, Dampening); 65 63 } 66 64 67 65 public int MinLeafSize(IRegressionProblemData pd) { -
LeafTypes/M5Leaf.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Leaf model type that fits a ridge-stabilised linear model on standardised inputs,
  /// drops variables with implausibly large standardised coefficients and then greedily
  /// removes further variables as long as an Akaike-style criterion improves.
  /// </summary>
  [StorableClass]
  [Item("M5Leaf", "A leaf type that uses linear models as leaf models. This is the standard for M5' regression")]
  public class M5Leaf : ParameterizedNamedItem, ILeafModel {
    #region Constructors & Cloning
    [StorableConstructor]
    private M5Leaf(bool deserializing) : base(deserializing) { }
    private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { }
    public M5Leaf() { }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new M5Leaf(this, cloner);
    }
    #endregion

    #region IModelType
    /// <summary>The models created by this leaf type are reported as not providing confidence estimates.</summary>
    public bool ProvidesConfidence {
      get { return false; }
    }

    /// <summary>
    /// Builds the leaf model for the given problem data.
    /// </summary>
    /// <param name="pd">Problem data; input variables whose training variance is (almost) zero are ignored.</param>
    /// <param name="random">Only forwarded to <c>ConstantLeaf</c> in the single-row case.</param>
    /// <param name="cancellationToken">Only forwarded to <c>ConstantLeaf</c> in the single-row case.</param>
    /// <param name="noParameters">Number of model parameters: the intercept plus one per retained coefficient.</param>
    /// <returns>The fitted regression model.</returns>
    /// <exception cref="ArgumentException">The dataset has no rows, or no linear model could be fitted.</exception>
    public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
      if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model");

      // a single instance cannot support a regression; fall back to a constant model
      if (pd.Dataset.Rows == 1)
        return new ConstantLeaf().Build(pd, random, cancellationToken, out noParameters);

      var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average());
      var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Variance());
      // constant inputs cannot be standardised (division by a zero standard deviation), so they are excluded up front
      var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList();

      var classMean = pd.TargetVariableTrainingValues.Average();
      var classVar = pd.TargetVariableTrainingValues.Variance();

      var model = FindBestModel(variances, means, classMean, classVar, pd, used);
      noParameters = 1 + model.Coefficients.Count;
      return model;
    }

    /// <summary>
    /// Fits the full model, repeatedly removes the variable flagged by <see cref="DeselectColinear"/>
    /// until none is flagged, and then performs greedy backward elimination while the
    /// Akaike-style criterion decreases.
    /// </summary>
    private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double cMean, double cVar, IRegressionProblemData pd, IList<string> variables) {
      Dictionary<string, double> coeffs;
      double intercept;
      // DeselectColinear returns a shorter list whenever it removed a variable; refit until nothing is removed
      do {
        coeffs = DoRegression(pd, variables, variances, means, cMean, 1.0e-8, out intercept);
        variables = DeselectColinear(variances, coeffs, cVar, pd, variables);
      }
      while (coeffs.Count != variables.Count);

      var numAtts = variables.Count;
      var numInst = pd.TrainingIndices.Count();
      var fullSse = CalculateSE(coeffs, intercept, pd, variables);
      var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts;

      var improved = true;
      var currentNumAttributes = numAtts;

      while (improved && currentNumAttributes > 1) {
        improved = false;
        currentNumAttributes--;

        // the variable with the smallest standardised coefficient is the removal candidate
        var candidate = variables
          .ToDictionary(v => v, v => StandardizedCoefficient(coeffs[v], variances[v], cVar))
          .OrderBy(x => x.Value)
          .Select(x => x.Key)
          .First();

        var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
        double currentIntercept;
        var currentCoeffs = DoRegression(pd, currVariables, variances, means, cMean, 1.0e-8, out currentIntercept);
        var currentSse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
        var currentAkaike = currentSse / fullSse * (numInst - numAtts) + 2 * currentNumAttributes;

        // a NaN criterion (e.g. fullSse == 0) compares false and therefore ends the elimination
        if (currentAkaike < akaike) {
          improved = true;
          akaike = currentAkaike;
          coeffs = currentCoeffs;
          intercept = currentIntercept;
          variables = currVariables;
        }
      }

      return new PreconstructedLinearModel(means, variances, coeffs, intercept, pd.TargetVariable);
    }

    /// <summary>
    /// Solves the ridge-regularised normal equations on standardised inputs and converts the
    /// solution back to the original input scale.
    /// </summary>
    /// <param name="ridge">Initial value added to the diagonal of X^T X; multiplied by 10 after every failed attempt.</param>
    /// <param name="intercept">Intercept of the model on the original input scale.</param>
    /// <returns>Coefficient per variable on the original input scale.</returns>
    /// <exception cref="ArgumentException">No solution was found within 100 attempts.</exception>
    private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double cmean, double ridge, out double intercept) {
      var n = variables.Count;
      var m = pd.TrainingIndices.Count();

      // without any usable input variable the least squares solution is the target mean
      if (n == 0) {
        intercept = cmean;
        return new Dictionary<string, double>();
      }

      // inTr = X^T with standardised inputs (one row per variable, one column per training instance)
      var inTr = new double[n, m];
      for (var i = 0; i < n; i++) {
        var v = variables[i];
        var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
        var sd = Math.Sqrt(variances[v]);
        var mean = means[v];
        for (var j = 0; j < m; j++) {
          inTr[i, j] = (vdata[j] - mean) / sd;
        }
      }

      var y = new double[m, 1];
      var ydata = pd.TargetVariableTrainingValues.ToArray();
      for (var i = 0; i < m; i++)
        y[i, 0] = ydata[i]; //no scaling for targets

      var aTy = new double[n, 1];
      alglib.rmatrixgemm(n, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y
      var aTa = new double[n, n];
      alglib.rmatrixgemm(n, n, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr)

      // the solver expects the right-hand side X^T y as a vector of length n (not the m raw target values)
      var aTyVector = new double[n];
      for (var i = 0; i < n; i++) aTyVector[i] = aTy[i, 0];

      var aTaDecomp = new double[n, n];
      var success = false;
      var tries = 0;
      double[] coefficients = null;
      do {
        try {
          // solve "(aTa + ridge * I) * coefficients = aTy" for coefficients;
          // the ridge is applied to a fresh copy so that aTa itself stays untouched between attempts
          Array.Copy(aTa, 0, aTaDecomp, 0, aTa.Length);
          for (var i = 0; i < n; i++) aTaDecomp[i, i] += ridge; // ridge on the diagonal enforces positive definiteness

          success = alglib.spdmatrixcholesky(ref aTaDecomp, n, true);
          if (success) {
            int info;
            alglib.densesolverreport report;
            alglib.spdmatrixcholeskysolve(aTaDecomp, n, true, aTyVector, out info, out report, out coefficients);
            success = info == 1;
          }
        }
        catch (alglib.alglibexception) {
          success = false;
        }

        if (!success) ridge *= 10; // increase ridge and retry
        tries++;
      }
      while (!success && tries < 100);
      if (!success || coefficients == null) throw new ArgumentException("No linear model could be built");

      // undo the standardisation: coefficient / sd, and shift the intercept by the scaled means
      intercept = cmean;
      var res = new Dictionary<string, double>();
      for (var i = 0; i < n; i++) {
        var v = variables[i];
        var coefficient = coefficients[i] / Math.Sqrt(variances[v]);
        res.Add(v, coefficient);
        intercept -= coefficient * means[v];
      }

      return res;
    }

    /// <summary>
    /// Removes the variable with the largest standardised coefficient if that coefficient exceeds 1.5.
    /// </summary>
    /// <returns>The unchanged list if no variable exceeds the threshold, otherwise a new list without the offending variable.</returns>
    private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double cVar, IRegressionProblemData pd, IList<string> variables) {
      var worst = variables
        .Select(v => new { Name = v, Score = StandardizedCoefficient(coeffs[v], variances[v], cVar) })
        .Where(x => x.Score > 1.5)
        .OrderByDescending(x => x.Score)
        .FirstOrDefault();
      if (worst == null) return variables;
      return variables.Where(v => !v.Equals(worst.Name)).ToList();
    }

    /// <summary>Absolute coefficient scaled by the ratio of input to target standard deviation.</summary>
    private static double StandardizedCoefficient(double coefficient, double variance, double cVar) {
      return Math.Abs(coefficient * Math.Sqrt(variance / cVar));
    }

    /// <summary>Sum of squared errors of the given linear model over the training rows.</summary>
    private static double CalculateSE(Dictionary<string, double> coefficients, double intercept, IRegressionProblemData pd, IList<string> variables) {
      return pd.TrainingIndices
        .Select(i => RegressionPrediction(i, pd, variables, coefficients, intercept) - pd.Dataset.GetDoubleValue(pd.TargetVariable, i))
        .Select(error => error * error)
        .Sum();
    }

    /// <summary>Prediction of the given linear model for dataset row <paramref name="i"/>.</summary>
    private static double RegressionPrediction(int i, IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> coefficients, double intercept) {
      return intercept + variables.Sum(v => pd.Dataset.GetDoubleValue(v, i) * coefficients[v]);
    }

    /// <summary>This leaf type can be built from a single instance (see the constant fallback in <see cref="Build"/>).</summary>
    public int MinLeafSize(IRegressionProblemData pd) {
      return 1;
    }
    #endregion
  }
}
LeafTypes/M5regLeaf.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Linq;
using System.Threading;
using HeuristicLab.Algorithms.DataAnalysis.Glmnet;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Leaf model type whose models are obtained from <c>ElasticNetLinearRegression</c>.
  /// </summary>
  [StorableClass]
  [Item("M5regLeaf", "A leaf type that uses regularized (elastic net) linear models as leaf models.")]
  public class M5regLeaf : ParameterizedNamedItem, ILeafModel {
    #region Constructors & Cloning
    [StorableConstructor]
    private M5regLeaf(bool deserializing) : base(deserializing) { }
    private M5regLeaf(M5regLeaf original, Cloner cloner) : base(original, cloner) { }
    public M5regLeaf() { }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new M5regLeaf(this, cloner);
    }
    #endregion

    #region IModelType
    // NOTE(review): Build returns the model of ElasticNetLinearRegression.CreateSymbolicSolution;
    // confirm that this model type really provides confidence estimates, otherwise this should be false.
    public bool ProvidesConfidence {
      get { return true; }
    }

    /// <summary>
    /// Builds a regularized linear model for the given problem data.
    /// </summary>
    /// <param name="pd">Problem data; must contain at least <see cref="MinLeafSize"/> rows.</param>
    /// <param name="random">Not used by this leaf type.</param>
    /// <param name="cancellationToken">Not used by this leaf type.</param>
    /// <param name="noParameters">Length of the coefficient array returned by the elastic net fit.</param>
    /// <returns>The model of the symbolic solution created from the fitted coefficients.</returns>
    /// <exception cref="ArgumentException">The dataset has fewer rows than <see cref="MinLeafSize"/>.</exception>
    public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
      if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model");

      // the two out values reported by the fit are not needed here
      double ignored1, ignored2;
      // fixed regularization settings 1 and 0.2 (presumably penalty and lambda - TODO confirm against ElasticNetLinearRegression)
      var coeffs = ElasticNetLinearRegression.CalculateModelCoefficients(pd, 1, 0.2, out ignored1, out ignored2);
      noParameters = coeffs.Length;
      return ElasticNetLinearRegression.CreateSymbolicSolution(coeffs, pd).Model;
    }

    /// <summary>Requires two more instances than there are allowed input variables.</summary>
    public int MinLeafSize(IRegressionProblemData pd) {
      return pd.AllowedInputVariables.Count() + 2;
    }
    #endregion
  }
}
M5Regression.cs
21 21 #region Parametername 22 22 private const string GenerateRulesParameterName = "GenerateRules"; 23 23 private const string HoldoutSizeParameterName = "HoldoutSize"; 24 private const string SpliterParameterName = "Split er";24 private const string SpliterParameterName = "Splitter"; 25 25 private const string MinimalNodeSizeParameterName = "MinimalNodeSize"; 26 26 private const string LeafModelParameterName = "LeafModel"; 27 27 private const string PruningTypeParameterName = "PruningType"; … … 37 37 public IFixedValueParameter<PercentValue> HoldoutSizeParameter { 38 38 get { return (IFixedValueParameter<PercentValue>)Parameters[HoldoutSizeParameterName]; } 39 39 } 40 public IConstrainedValueParameter<ISplit er> ImpurityParameter {41 get { return (IConstrainedValueParameter<ISplit er>)Parameters[SpliterParameterName]; }40 public IConstrainedValueParameter<ISplitter> ImpurityParameter { 41 get { return (IConstrainedValueParameter<ISplitter>)Parameters[SpliterParameterName]; } 42 42 } 43 43 public IFixedValueParameter<IntValue> MinimalNodeSizeParameter { 44 44 get { return (IFixedValueParameter<IntValue>)Parameters[MinimalNodeSizeParameterName]; } … … 67 67 public double HoldoutSize { 68 68 get { return HoldoutSizeParameter.Value.Value; } 69 69 } 70 public ISplit er Split {70 public ISplitter Split { 71 71 get { return ImpurityParameter.Value; } 72 72 } 73 73 public int MinimalNodeSize { … … 97 97 public M5Regression() { 98 98 var modelSet = new ItemSet<ILeafModel>(ApplicationManager.Manager.GetInstances<ILeafModel>()); 99 99 var pruningSet = new ItemSet<IPruning>(ApplicationManager.Manager.GetInstances<IPruning>()); 100 var impuritySet = new ItemSet<ISplit er>(ApplicationManager.Manager.GetInstances<ISpliter>());100 var impuritySet = new ItemSet<ISplitter>(ApplicationManager.Manager.GetInstances<ISplitter>()); 101 101 Parameters.Add(new FixedValueParameter<BoolValue>(GenerateRulesParameterName, "Whether a set of rules or a decision tree shall be created", new 
BoolValue(false))); 102 102 Parameters.Add(new FixedValueParameter<PercentValue>(HoldoutSizeParameterName, "How much of the training set shall be reserved for pruning", new PercentValue(0.2))); 103 Parameters.Add(new ConstrainedValueParameter<ISplit er>(SpliterParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<M5Spliter>().First()));103 Parameters.Add(new ConstrainedValueParameter<ISplitter>(SpliterParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<M5Splitter>().First())); 104 104 Parameters.Add(new FixedValueParameter<IntValue>(MinimalNodeSizeParameterName, "The minimal number of samples in a leaf node", new IntValue(1))); 105 105 Parameters.Add(new ConstrainedValueParameter<ILeafModel>(LeafModelParameterName, "The type of model used for the nodes", modelSet, modelSet.OfType<LinearLeaf>().First())); 106 106 Parameters.Add(new ConstrainedValueParameter<IPruning>(PruningTypeParameterName, "The type of pruning used", pruningSet, pruningSet.OfType<M5LinearBottomUpPruning>().First())); … … 119 119 if (SetSeedRandomly) SeedParameter.Value.Value = new System.Random().Next(); 120 120 random.Reset(Seed); 121 121 var solution = CreateM5RegressionSolution(Problem.ProblemData, random, LeafModel, Split, Pruning, UseHoldout, HoldoutSize, MinimalNodeSize, GenerateRules, Results, cancellationToken); 122 AnalyzeSolution(solution );122 AnalyzeSolution(solution, Results, Problem.ProblemData); 123 123 } 124 124 125 125 #region Static Interface 126 126 public static IRegressionSolution CreateM5RegressionSolution(IRegressionProblemData problemData, IRandom random, 127 ILeafModel leafModel = null, ISplit er spliter = null, IPruning pruning = null,127 ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null, 128 128 bool useHoldout = false, double holdoutSize = 0.2, int minNumInstances = 4, bool generateRules = false, ResultCollection results = null, 
CancellationToken? cancellationToken = null) { 129 129 //set default values 130 130 if (leafModel == null) leafModel = new LinearLeaf(); 131 if (split er == null) spliter = new M5Spliter();131 if (splitter == null) splitter = new M5Splitter(); 132 132 if (cancellationToken == null) cancellationToken = CancellationToken.None; 133 133 if (pruning == null) pruning = new M5LeafBottomUpPruning(); 134 134 135 //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise 135 136 var doubleVars = new HashSet<string>(problemData.Dataset.DoubleVariables); 136 137 var vars = problemData.AllowedInputVariables.Concat(new[] {problemData.TargetVariable}).ToArray(); 137 138 if (vars.Any(v => !doubleVars.Contains(v))) throw new NotSupportedException("M5 regression supports only double valued input or output features."); 138 139 var values = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray(); 140 if (values.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) 139 var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray(); 140 if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) 141 141 throw new NotSupportedException("M5 regression does not support NaN or infinity values in the input dataset."); 142 143 var trainingData = new Dataset(vars, values); 142 var trainingData = new Dataset(vars, doubles); 144 143 var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable); 145 144 pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows; 146 145 pd.TrainingPartition.Start = 0; 147 146 148 //create & build Model 149 var m5Params = new M5Parameters(pruning, minNumInstances, leafModel, pd, random, spliter, results); 150 147 //intialize M5Parameters and pruning set 148 var m5Params = new M5Parameters(pruning, minNumInstances, 
leafModel, pd, random, splitter, results); 151 149 IReadOnlyList<int> trainingRows, pruningRows; 152 150 GeneratePruningSet(problemData.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingRows, out pruningRows); 153 151 152 //create & build Model 154 153 IM5Model model; 155 if (generateRules) 156 model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params); 157 else 158 model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params); 154 if (generateRules) model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params); 155 else model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params); 156 model.Build(trainingRows, pruningRows, m5Params, cancellationToken.Value); 159 157 160 model.Build(trainingRows, pruningRows, m5Params, cancellationToken.Value);161 158 return model.CreateRegressionSolution(problemData); 162 159 } 163 160 164 public static void UpdateM5Model(IRegressionModel model, IRegressionProblemData problemData, IRandom random, 165 ILeafModel leafModel, CancellationToken? cancellationToken = null) { 166 var m5Model = model as IM5Model; 167 if (m5Model == null) throw new ArgumentException("This type of model can not be updated"); 168 UpdateM5Model(m5Model, problemData, random, leafModel, cancellationToken); 169 } 170 171 private static void UpdateM5Model(IM5Model model, IRegressionProblemData problemData, IRandom random, 172 ILeafModel leafModel = null, CancellationToken? cancellationToken = null) { 161 public static void UpdateM5Model(IM5Model model, IRegressionProblemData problemData, IRandom random, ILeafModel leafModel, CancellationToken? 
cancellationToken = null) { 173 162 if (cancellationToken == null) cancellationToken = CancellationToken.None; 174 163 var m5Params = new M5Parameters(leafModel, problemData, random); 175 164 model.Update(problemData.TrainingIndices.ToList(), m5Params, cancellationToken.Value); … … 189 178 training = perm.Take(cut).Select(i => allrows[i]).ToArray(); 190 179 } 191 180 192 private void AnalyzeSolution(IRegressionSolution solution) {181 private static void AnalyzeSolution(IRegressionSolution solution, ResultCollection Results, IRegressionProblemData problemData) { 193 182 Results.Add(new Result("RegressionSolution", (IItem)solution.Clone())); 194 183 195 Dictionary<string, int> frequencies; 196 if (!GenerateRules) { 197 Results.Add(M5Analyzer.CreateLeafDepthHistogram((M5TreeModel)solution.Model)); 198 frequencies = M5Analyzer.GetTreeVariableFrequences((M5TreeModel)solution.Model); 184 Dictionary<string, int> frequencies = null; 185 186 var tree = solution.Model as M5TreeModel; 187 if (tree != null) { 188 Results.Add(M5Analyzer.CreateLeafDepthHistogram(tree)); 189 frequencies = M5Analyzer.GetTreeVariableFrequences(tree); 190 M5Analyzer.AnalyzeNodes(tree, Results, problemData); 199 191 } 200 else { 201 Results.Add(M5Analyzer.CreateRulesResult((M5RuleSetModel)solution.Model, Problem.ProblemData, "M5TreeResult", true)); 202 frequencies = M5Analyzer.GetRuleVariableFrequences((M5RuleSetModel)solution.Model); 203 Results.Add(M5Analyzer.CreateCoverageDiagram((M5RuleSetModel)solution.Model, Problem.ProblemData)); 192 193 var ruleSet = solution.Model as M5RuleSetModel; 194 if (ruleSet != null) { 195 Results.Add(M5Analyzer.CreateRulesResult(ruleSet, problemData, "M5Rules", true)); 196 frequencies = M5Analyzer.GetRuleVariableFrequences(ruleSet); 197 Results.Add(M5Analyzer.CreateCoverageDiagram(ruleSet, problemData)); 204 198 } 205 199 206 200 //Variable frequencies 207 var sum = frequencies.Values.Sum(); 208 sum = sum == 0 ? 
1 : sum; 209 var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) { 210 ElementNames = frequencies.Select(i => i.Key) 211 }; 212 Results.Add(new Result("Variable Frequences", "relative frequencies of variables in rules and tree nodes", impactArray)); 201 if (frequencies != null) { 202 var sum = frequencies.Values.Sum(); 203 sum = sum == 0 ? 1 : sum; 204 var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) { 205 ElementNames = frequencies.Select(i => i.Key) 206 }; 207 Results.Add(new Result("Variable Frequences", "relative frequencies of variables in rules and tree nodes", impactArray)); 208 } 213 209 } 214 210 #endregion 215 211 } -
M5Utilities/M5Analyzer.cs
22 22 using System.Collections.Generic; 23 23 using System.Linq; 24 24 using HeuristicLab.Analysis; 25 using HeuristicLab.Common; 25 26 using HeuristicLab.Data; 27 using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; 26 28 using HeuristicLab.Optimization; 29 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 27 30 using HeuristicLab.Problems.DataAnalysis; 31 using HeuristicLab.Problems.DataAnalysis.Symbolic; 28 32 29 33 namespace HeuristicLab.Algorithms.DataAnalysis { 30 internalstatic class M5Analyzer {34 public static class M5Analyzer { 31 35 private const string ConditionResultName = "Condition"; 32 36 private const string CoverResultName = "Covered Instances"; 33 37 private const string CoverageDiagramResultName = "Coverage"; … … 52 56 public static Result CreateLeafDepthHistogram(M5TreeModel treeModel) { 53 57 var list = new List<int>(); 54 58 GetLeafDepths(treeModel.Root, 0, list); 55 var row = new DataRow("Depths", "", list.Select(x => (double) 59 var row = new DataRow("Depths", "", list.Select(x => (double)x)) { 56 60 VisualProperties = {ChartType = DataRowVisualProperties.DataRowChartType.Histogram} 57 61 }; 58 62 var hist = new DataTable("LeafDepths"); … … 130 134 } 131 135 return res; 132 136 } 137 138 public static void AnalyzeNodes(M5TreeModel tree, ResultCollection results, IRegressionProblemData pd) { 139 var dict = new Dictionary<int, M5NodeModel>(); 140 var modelNumber = new IntValue(1); 141 var symtree = new SymbolicExpressionTree(MirrorTree(tree.Root, dict, modelNumber, pd.Dataset, pd.TrainingIndices.ToList())); 142 results.AddOrUpdateResult("DecisionTree", symtree); 143 144 if (dict.Count > 200) return; 145 var models = new ResultCollection(); 146 results.AddOrUpdateResult("NodeModels", models); 147 foreach (var m in dict.Keys.OrderBy(x => x)) { 148 models.AddOrUpdateResult("Model " + m, dict[m].CreateRegressionSolution(pd)); 149 } 150 } 151 152 private static SymbolicExpressionTreeNode MirrorTree(M5NodeModel node, 
IDictionary<int, M5NodeModel> dict, IntValue nextId, IDataset data, IReadOnlyList<int> rows) { 153 if (node.IsLeaf) { 154 var i = nextId.Value++; 155 dict.Add(i, node); 156 return new SymbolicExpressionTreeNode(new TextSymbol("Model " + i + " " + rows.Count + " Instances")); 157 } 158 159 var text = node.SplitAttribute + " <= " + node.SplitValue.ToString("0.###") + " pf= " + node.PruningStrength.ToString("0.###"); 160 var textNode = new SymbolicExpressionTreeNode(new TextSymbol(text)); 161 IReadOnlyList<int> lrows, rrows; 162 M5StaticUtilities.SplitRows(rows, data, node.SplitAttribute, node.SplitValue, out lrows, out rrows); 163 164 textNode.AddSubtree(MirrorTree(node.Left, dict, nextId, data, lrows)); 165 textNode.AddSubtree(MirrorTree(node.Right, dict, nextId, data, rrows)); 166 167 return textNode; 168 } 169 170 171 [StorableClass] 172 private class TextSymbol : Symbol { 173 [StorableConstructor] 174 public TextSymbol(bool deserializing) : base(deserializing) { } 175 public TextSymbol(Symbol original, Cloner cloner) : base(original, cloner) { } 176 public TextSymbol(string name) : base(name, "") { 177 this.Name = name; 178 } 179 public override IDeepCloneable Clone(Cloner cloner) { 180 return new TextSymbol(this, cloner); 181 } 182 public override int MinimumArity { 183 get { return 0; } 184 } 185 public override int MaximumArity { 186 get { return int.MaxValue; } 187 } 188 } 133 189 } 134 190 } 191 No newline at end of file -
M5Utilities/M5Parameters.cs
26 26 using HeuristicLab.Problems.DataAnalysis; 27 27 28 28 namespace HeuristicLab.Algorithms.DataAnalysis { 29 internalclass M5Parameters {30 private readonly ISplit er splitter;29 public class M5Parameters { 30 private readonly ISplitter splitter; 31 31 private readonly IPruning pruning; 32 32 private readonly ILeafModel leafModel; 33 33 private readonly int minLeafSize; … … 34 34 private readonly IRegressionProblemData problemData; 35 35 private readonly IRandom random; 36 36 private readonly ResultCollection results; 37 public ISplit er Spliter {37 public ISplitter Splitter { 38 38 get { return splitter; } 39 39 } 40 40 public IPruning Pruning { … … 66 66 } 67 67 68 68 public M5Parameters(IPruning pruning, int minleafSize, ILeafModel leafModel, 69 IRegressionProblemData problemData, IRandom random, ISplit er splitter, ResultCollection results) {69 IRegressionProblemData problemData, IRandom random, ISplitter splitter, ResultCollection results) { 70 70 this.problemData = problemData; 71 71 this.random = random; 72 72 this.leafModel = leafModel; -
M5Utilities/M5StaticUtilities.cs
42 42 } 43 43 if (alg.ExecutionState != ExecutionState.Paused) alg.Prepare(); 44 44 alg.Start(cancellationToken); 45 return alg.Results; 45 var res = alg.Results; 46 alg.Runs.Clear(); 47 return res; 46 48 } 47 49 48 50 public static void SplitRows(IReadOnlyList<int> rows, IDataset data, string splitAttr, double splitValue, out IReadOnlyList<int> leftRows, out IReadOnlyList<int> rightRows) { 49 //TODO check and revert points at borders are now used multipe times51 //TODO check and revert?: points at borders are now used multipe times 50 52 var assignment = data.GetDoubleValues(splitAttr, rows).Select(x => x.IsAlmost(splitValue) ? 2 : x < splitValue ? 0 : 1).ToArray(); 51 53 leftRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b == 0 || x.b == 2).Select(x => x.i).ToList(); 52 54 rightRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b > 0).Select(x => x.i).ToList(); -
MetaModels/M5NodeModel.cs
31 31 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 internalclass M5NodeModel : RegressionModel {34 public class M5NodeModel : RegressionModel { 35 35 #region Properties 36 public double PruningStrength; 37 36 38 [Storable] 37 39 internal bool IsLeaf { get; private set; } 38 40 [Storable] … … 106 108 SplitValue = double.NaN; 107 109 string attr; 108 110 double splitValue; 109 IsLeaf = !m5Params.Split er.Split(new RegressionProblemData(M5StaticUtilities.ReduceDataset(m5Params.Data, rows, variables, TargetVariable), variables, TargetVariable), m5Params.MinLeafSize, out attr, out splitValue);111 IsLeaf = !m5Params.Splitter.Split(new RegressionProblemData(M5StaticUtilities.ReduceDataset(m5Params.Data, rows, variables, TargetVariable), variables, TargetVariable), m5Params.MinLeafSize, out attr, out splitValue); 110 112 if (IsLeaf) return; 111 113 112 114 //split Dataset -
MetaModels/M5RuleModel.cs
31 31 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 internalclass M5RuleModel : RegressionModel {34 public class M5RuleModel : RegressionModel { 35 35 #region Properties 36 36 [Storable] 37 37 internal string[] SplitAttributes { get; private set; } -
MetaModels/M5RuleSetModel.cs
31 31 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 internalclass M5RuleSetModel : RegressionModel, IM5Model {34 public class M5RuleSetModel : RegressionModel, IM5Model { 35 35 private const string NumRulesResultName = "Number of rules"; 36 36 private const string CoveredInstancesResultName = "Covered instances"; 37 37 -
MetaModels/M5TreeModel.cs
31 31 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 internalclass M5TreeModel : RegressionModel, IM5Model {34 public class M5TreeModel : RegressionModel, IM5Model { 35 35 public const string NumCurrentLeafsResultName = "Number of current leafs"; 36 36 #region Properties 37 37 [Storable] … … 69 69 70 70 #region IM5Model 71 71 public void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 72 //create intial (overfitted tree) 72 73 Root = M5NodeModel.CreateNode(m5Params.TargetVariable, m5Params); 73 74 Root.Split(trainingRows, m5Params); 74 75 75 InitializeLeafCounter(m5Params); 76 //intitalize leafs counter 77 var leafs = Root.EnumerateNodes().Count(x => x.IsLeaf); 78 if (!m5Params.Results.ContainsKey(NumCurrentLeafsResultName)) 79 m5Params.Results.Add(new Result(NumCurrentLeafsResultName, new IntValue(leafs))); 80 else ((IntValue)m5Params.Results[NumCurrentLeafsResultName].Value).Value = leafs; 76 81 77 var buPruner = m5Params.Pruning as BottomUpPruningBase;78 if (buPruner != null) buPruner.Prune(this, trainingRows, pruningRows, m5Params, cancellationToken);82 //prune 83 m5Params.Pruning.Prune(this, trainingRows, pruningRows, m5Params, cancellationToken); 79 84 85 //build final leaf models 80 86 Root.BuildLeafModels(trainingRows.Union(pruningRows).ToArray(), m5Params, cancellationToken); 81 87 } 82 88 … … 85 91 } 86 92 #endregion 87 93 88 #region Helpers89 private void InitializeLeafCounter(M5Parameters m5Params) {90 if (!m5Params.Results.ContainsKey(NumCurrentLeafsResultName))91 m5Params.Results.Add(new Result(NumCurrentLeafsResultName, new IntValue(Root.EnumerateNodes().Count(x => x.IsLeaf))));92 else ((IntValue)m5Params.Results[NumCurrentLeafsResultName].Value).Value = Root.EnumerateNodes().Count(x => x.IsLeaf);93 }94 #endregion95 96 94 [StorableClass] 97 95 private class ConfidenceM5TreeModel : M5TreeModel, IConfidenceRegressionModel { 98 96 
#region HLConstructors & Cloning -
Pruning/BottomUpPruningBase.cs
19 19 */ 20 20 #endregion 21 21 22 using System; 22 23 using System.Collections.Generic; 23 24 using System.Linq; 24 25 using System.Threading; … … 46 47 protected BottomUpPruningBase(bool deserializing) : base(deserializing) { } 47 48 protected BottomUpPruningBase(BottomUpPruningBase original, Cloner cloner) : base(original, cloner) { } 48 49 protected BottomUpPruningBase() { 49 Parameters.Add(new FixedValueParameter<DoubleValue>(PruningStrengthParameterName, "The strength of the pruning. Higher values force the algorithm to create simpler models", new DoubleValue( 4.0)));50 Parameters.Add(new FixedValueParameter<DoubleValue>(PruningStrengthParameterName, "The strength of the pruning. Higher values force the algorithm to create simpler models", new DoubleValue(2.0))); 50 51 } 51 52 #endregion 52 53 … … 58 59 } 59 60 #endregion 60 61 61 internal void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 62 var globalStdDev = m5Params.Data.GetDoubleValues(m5Params.TargetVariable, trainingRows).StandardDeviationPop(); 63 64 Prune(treeModel.Root, trainingRows, pruningRows, m5Params, new Dictionary<M5NodeModel, int>(), new Dictionary<M5NodeModel, int>(), cancellationToken, globalStdDev); 62 public void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 63 Prune(treeModel.Root, trainingRows, pruningRows, m5Params, new Dictionary<M5NodeModel, int>(), new Dictionary<M5NodeModel, int>(), cancellationToken); 65 64 } 66 65 67 66 private bool Prune(M5NodeModel node, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, 68 67 Dictionary<M5NodeModel, int> modelComplexities, Dictionary<M5NodeModel, int> nodeComplexities, 69 CancellationToken cancellationToken , double globalStdDev) {68 CancellationToken cancellationToken) { 70 69 //build 
pruning model 71 70 int numModelParams; 72 71 var pruningModel = M5StaticUtilities.BuildModel(trainingRows, m5Params, PruningLeafModel(m5Params.LeafModel), cancellationToken, out numModelParams); … … 79 78 } 80 79 81 80 //split training & pruning data 82 IReadOnlyList<int> left Test, rightTest;83 M5StaticUtilities.SplitRows(pruningRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out left Test, out rightTest);81 IReadOnlyList<int> leftPruning, rightPruning; 82 M5StaticUtilities.SplitRows(pruningRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out leftPruning, out rightPruning); 84 83 IReadOnlyList<int> leftTraining, rightTraining; 85 84 M5StaticUtilities.SplitRows(trainingRows, m5Params.Data, node.SplitAttribute, node.SplitValue, out leftTraining, out rightTraining); 86 85 87 86 //prune children frist 88 var lpruned = Prune(node.Left, leftTraining, left Test, m5Params, modelComplexities, nodeComplexities, cancellationToken, globalStdDev);89 var rpruned = Prune(node.Right, rightTraining, right Test, m5Params, modelComplexities, nodeComplexities, cancellationToken, globalStdDev);87 var lpruned = Prune(node.Left, leftTraining, leftPruning, m5Params, modelComplexities, nodeComplexities, cancellationToken); 88 var rpruned = Prune(node.Right, rightTraining, rightPruning, m5Params, modelComplexities, nodeComplexities, cancellationToken); 90 89 nodeComplexities.Add(node, nodeComplexities[node.Left] + nodeComplexities[node.Right] + 1); 91 90 92 91 //TODO check if this reduces quality. 
It reduces training effort (consideraby for some pruningTypes) … … 93 92 if (!lpruned && !rpruned) return false; 94 93 95 94 //check if pruning will happen on this node 96 if (!DecidePruneNode(node, m5Params, pruningRows, modelComplexities, nodeComplexities , globalStdDev)) return false;95 if (!DecidePruneNode(node, m5Params, pruningRows, modelComplexities, nodeComplexities)) return false; 97 96 98 97 //convert to leafNode 99 98 ((IntValue)m5Params.Results[M5TreeModel.NumCurrentLeafsResultName].Value).Value -= node.EnumerateNodes().Count(x => x.IsLeaf) - 1; 100 99 101 //TODO chack wether removal is beneficial102 100 nodeComplexities.Remove(node.Left); 103 101 nodeComplexities.Remove(node.Right); 104 102 modelComplexities.Remove(node.Left); … … 110 108 } 111 109 112 110 private bool DecidePruneNode(M5NodeModel node, M5Parameters m5Params, IReadOnlyCollection<int> testRows, 113 IReadOnlyDictionary<M5NodeModel, int> modelComplexities, IReadOnlyDictionary<M5NodeModel, int> nodeComplexities, 114 double globalStdDev) { 111 IReadOnlyDictionary<M5NodeModel, int> modelComplexities, IReadOnlyDictionary<M5NodeModel, int> nodeComplexities) { 115 112 if (testRows.Count == 0) return true; 116 113 117 114 //create regressionProblemdata from pruning data … … 123 120 124 121 //evaluate combined sub nodes and pruning model 125 122 var rmsModel = node.Model.CreateRegressionSolution(pd).TestRootMeanSquaredError; 123 rmsModel = rmsModel.IsAlmost(0.0) ? 0 : rmsModel; 126 124 var rmsSubTree = node.CreateRegressionSolution(pd).TestRootMeanSquaredError; 125 rmsSubTree = rmsSubTree.IsAlmost(0.0) ? 
0 : rmsSubTree; 127 126 127 int rows = pd.Dataset.Rows; 128 node.PruningStrength = rows * (rmsModel * (rows - nodeComplexities[node]) + rmsSubTree * (modelComplexities[node] - rows)); 129 node.PruningStrength /= rmsModel * modelComplexities[node] * (nodeComplexities[node] - rows) + rmsSubTree * nodeComplexities[node] * (rows - modelComplexities[node]); 130 131 var pf1 = PruningFactor(pd.Dataset.Rows, modelComplexities[node]); 132 var pf2 = PruningFactor(pd.Dataset.Rows, nodeComplexities[node]); 128 133 //weigh, compare and decide 129 var adjustedRmsModel = rmsModel * PruningFactor(pd.Dataset.Rows, modelComplexities[node]); 130 var adjustedRmsTree = rmsSubTree * PruningFactor(pd.Dataset.Rows, nodeComplexities[node.Left] + nodeComplexities[node.Right] + 1); 131 return adjustedRmsModel <= adjustedRmsTree; 134 var adjustedRmsModel = rmsModel * pf1; 135 var adjustedRmsTree = rmsSubTree * pf2; 136 137 return adjustedRmsModel < adjustedRmsTree; 132 138 } 133 139 134 140 private double PruningFactor(int noInstances, int noParams) { 135 return noInstances <= noParams ? 10.0 : (noInstances + PruningStrength * noParams) / (noInstances - PruningStrength * noParams); 141 //in the original M5 tree a cut off is used: 142 if (noInstances <= noParams) return 10; 143 //but to have at least some punishment for additional parameters, I would prefer: */ 144 if (noInstances <= noParams) return noInstances + PruningStrength * noParams; 145 146 return (noInstances + PruningStrength * noParams) / (noInstances - noParams); 136 147 } 137 148 } 138 149 } 150 No newline at end of file -
Pruning/NoPruning.cs
20 20 #endregion 21 21 22 22 using System.Collections.Generic; 23 using System.Threading; 23 24 using HeuristicLab.Common; 24 25 using HeuristicLab.Core; 25 26 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; … … 40 41 public int MinLeafSize(IRegressionProblemData pd, ILeafModel leafModel) { 41 42 return 0; 42 43 } 44 45 public void Prune(M5TreeModel treeModel, IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { } 43 46 #endregion 44 47 } 45 48 } 49 No newline at end of file -
Spliting/CorrelationImpuritiyCalculator.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Helper class for incremental split calculation.
  /// Keeps one univariate linear regression for the instances left of a potential split
  /// and one for the instances right of it, and updates both while the split position is
  /// moved along the ordered training instances.
  /// </summary>
  internal class CorrelationImpuritiyCalculator {
    #region state
    //Data
    private readonly List<double> attributeValues;
    private readonly List<double> targetValues;
    private readonly double order;
    private readonly UnivariateOnlineLR left;
    private readonly UnivariateOnlineLR right;
    #endregion

    #region Properties
    /// <summary>
    /// Score of the current split position (larger is better): -(ssrLeft^(1/order) + ssrRight^(1/order)),
    /// or double.MinValue if one of the two sides holds fewer than two instances.
    /// </summary>
    public double Impurity { get; private set; }

    /// <summary>
    /// Midpoint between the last attribute value on the left side and the first one on the right side.
    /// Negative/positive infinity if the left/right side is empty.
    /// </summary>
    public double SplitValue {
      get {
        if (left.Size <= 0) return double.NegativeInfinity;
        if (left.Size >= attributeValues.Count) return double.PositiveInfinity;
        return (attributeValues[left.Size - 1] + attributeValues[left.Size]) / 2;
      }
    }

    /// <summary>
    /// True if the current position lies between two distinguishable attribute values.
    /// False if one side is empty, because there is no pair of neighbours to split between.
    /// </summary>
    public bool ValidPosition {
      get {
        //same boundary cases as in SplitValue; indexing here would be out of range
        if (left.Size <= 0 || left.Size >= attributeValues.Count) return false;
        return !attributeValues[left.Size - 1].IsAlmost(attributeValues[left.Size]);
      }
    }

    /// <summary>Number of instances currently on the left side of the split.</summary>
    public int LeftSize {
      get { return left.Size; }
    }
    #endregion

    #region Constructors
    /// <summary>
    /// Creates a calculator whose split initially lies after the first <paramref name="partition"/> instances.
    /// </summary>
    /// <param name="partition">number of instances initially assigned to the left side</param>
    /// <param name="atts">attribute values of the instances, in the order in which the split is moved</param>
    /// <param name="targets">target values belonging to <paramref name="atts"/></param>
    /// <param name="order">root that is applied to the sum of squared residuals of each side, must be larger than 0</param>
    /// <exception cref="ArgumentException">if <paramref name="order"/> is not larger than 0</exception>
    public CorrelationImpuritiyCalculator(int partition, IEnumerable<double> atts, IEnumerable<double> targets, double order) {
      if (order <= 0) throw new ArgumentException("Splitter order must be larger than 0");
      this.order = order;
      attributeValues = atts.ToList();
      targetValues = targets.ToList();
      left = new UnivariateOnlineLR(attributeValues.Take(partition).ToList(), targetValues.Take(partition).ToList());
      right = new UnivariateOnlineLR(attributeValues.Skip(partition).ToList(), targetValues.Skip(partition).ToList());
      UpdateImpurity();
    }
    #endregion

    #region IImpurityCalculator
    /// <summary>
    /// Moves the first instance of the right side over to the left side and updates <see cref="Impurity"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">if the right side is already empty</exception>
    public void Increment() {
      if (left.Size >= attributeValues.Count)
        throw new InvalidOperationException("All instances have already been moved to the left side of the split");
      var target = targetValues[left.Size];
      var att = attributeValues[left.Size];
      left.Add(att, target);
      right.Remove(att, target);
      UpdateImpurity();
    }
    #endregion

    private void UpdateImpurity() {
      if (left.Size > 1 && right.Size > 1) {
        //a sum of squared residuals can not be negative in exact arithmetic; clamping rounding noise
        //prevents Math.Pow from returning NaN for fractional exponents
        var yl = Math.Pow(Math.Max(0.0, left.Ssr), 1.0 / order);
        var yr = Math.Pow(Math.Max(0.0, right.Ssr), 1.0 / order);
        Impurity = -yl - yr;
      } else {
        Impurity = double.MinValue;
      }
    }
  }
}
Spliting/CorrelationSplitter.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Split selector that scores a split by the sums of squared residuals of two univariate
  /// linear regressions (one per side) and picks the attribute and position with the best score.
  /// </summary>
  [StorableClass]
  [Item("CorrelationSplitter", "An experimental split selector that uses correlation coefficients")]
  public class CorrelationSplitter : ParameterizedNamedItem, ISplitter {
    public const string OrderParameterName = "Order";
    public IFixedValueParameter<DoubleValue> OrderParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
    }
    public double Order {
      get { return OrderParameter.Value.Value; }
    }

    #region Constructors & Cloning
    //the storable constructor has to chain to the base class (as the other items of this plugin do),
    //otherwise the parameterless base constructor runs during deserialization
    [StorableConstructor]
    private CorrelationSplitter(bool deserializing) : base(deserializing) { }
    private CorrelationSplitter(CorrelationSplitter original, Cloner cloner) : base(original, cloner) { }
    public CorrelationSplitter() {
      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation ssrLeft^(1/Order)+ssrRight^(1/Order).", new DoubleValue(1)));
    }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new CorrelationSplitter(this, cloner);
    }
    #endregion

    #region ISplitter
    /// <summary>
    /// Decides whether a node should be split and if so at which attribute and which value.
    /// </summary>
    /// <param name="splitData">the data of the node</param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="splitAttr">the best attribute found (empty if none was found)</param>
    /// <param name="splitValue">the threshold belonging to <paramref name="splitAttr"/></param>
    /// <returns>true if a split with two non-empty sides of at least <paramref name="minLeafSize"/> instances exists</returns>
    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
      var bestSize = 0;
      var bestImpurity = double.MinValue;
      var bestSplitValue = 0.0;
      var bestSplitAttr = string.Empty;
      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;
      var rows = splitData.Dataset.Rows;
      if (rows < minLeafSize * 2) return false;

      //the targets are the same for every attribute, read them only once
      var targets = splitData.Dataset.GetDoubleValues(splitData.TargetVariable).ToArray();

      //find best Attribute for the Splitter
      foreach (var attr in splitData.AllowedInputVariables) {
        int size;
        double impurity, sValue;
        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(targets, Tuple.Create).OrderBy(x => x.Item1).ToArray();
        AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out size, out impurity, out sValue);
        if (!(bestImpurity < impurity)) continue;
        bestImpurity = impurity;
        bestSize = size;
        bestSplitValue = sValue;
        bestSplitAttr = attr;
      }

      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;

      //if no suitable split exists => leafNode
      //bestSize stays 0 if no attribute offered a split; without the first two checks this
      //would be reported as a split for minLeafSize == 0
      return bestSize > 0 && bestSize < rows && bestSize >= minLeafSize && bestSize <= rows - minLeafSize;
    }

    /// <summary>
    /// Finds the best split position for a single attribute.
    /// </summary>
    /// <param name="attValues">attribute values in ascending order</param>
    /// <param name="targetValues">target values in the order of <paramref name="attValues"/></param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="leftSize">number of instances left of the best split, -1 if no split is possible</param>
    /// <param name="maxImpurity">score of the best split, negative infinity if no split is possible</param>
    /// <param name="splitValue">threshold of the best split</param>
    private void AttributeSplit(IReadOnlyList<double> attValues, IEnumerable<double> targetValues, int minLeafSize, out int leftSize, out double maxImpurity, out double splitValue) {
      leftSize = -1;
      splitValue = double.MinValue;
      maxImpurity = double.NegativeInfinity;
      var length = attValues.Count;

      //both sides have to be non-empty and hold at least minLeafSize instances
      var maxLeftSize = Math.Min(length - minLeafSize, length - 1);

      //first admissible position that lies between two different attribute values
      //(bounds are checked before indexing)
      var start = Math.Max(1, minLeafSize);
      while (start < length && attValues[start - 1].IsAlmost(attValues[start]))
        start++;
      if (start > maxLeftSize) return;

      var splitValues = new List<double>();
      var splitSizes = new List<int>();

      //start at the first valid position, so that the initial candidate is never a split between two equal points
      var imp = new CorrelationImpuritiyCalculator(start, attValues, targetValues, Order);
      maxImpurity = imp.Impurity;
      splitValues.Add(imp.SplitValue);
      splitSizes.Add(imp.LeftSize);

      while (imp.LeftSize < maxLeftSize) {
        imp.Increment();
        if (!imp.ValidPosition) continue; //splits can not be made between two equal points

        //collect all positions that are (almost) as good as the best one
        if (imp.Impurity.IsAlmost(maxImpurity)) {
          splitValues.Add(imp.SplitValue);
          splitSizes.Add(imp.LeftSize);
          continue;
        }

        if (imp.Impurity < maxImpurity) continue;
        splitValues.Clear();
        splitSizes.Clear();
        maxImpurity = imp.Impurity;
        splitValues.Add(imp.SplitValue);
        splitSizes.Add(imp.LeftSize);
      }

      //of several equally good positions the middle one is used
      var j = splitSizes.Count / 2;
      splitValue = splitValues[j];
      leftSize = splitSizes[j];
    }
    #endregion
  }
}
Spliting/M5Splitter.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Split selector that evaluates every admissible position of every allowed input variable
  /// with an <see cref="OrderImpurityCalculator"/> and picks the position with the highest impurity score.
  /// </summary>
  [StorableClass]
  [Item("M5Splitter", "A split selector that uses the ratio between Variances^(1/Order) to determine good splits")]
  public class M5Splitter : ParameterizedNamedItem, ISplitter {
    public const string OrderParameterName = "Order";
    public IFixedValueParameter<DoubleValue> OrderParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
    }
    public double Order {
      get { return OrderParameter.Value.Value; }
    }

    #region Constructors & Cloning
    //the storable constructor has to chain to the base class (as the other items of this plugin do),
    //otherwise the parameterless base constructor runs during deserialization
    [StorableConstructor]
    private M5Splitter(bool deserializing) : base(deserializing) { }
    private M5Splitter(M5Splitter original, Cloner cloner) : base(original, cloner) { }
    public M5Splitter() {
      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation sum (x_i - x_avg)^Order.", new DoubleValue(5)));
    }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new M5Splitter(this, cloner);
    }
    #endregion

    #region ISplitter
    /// <summary>
    /// Decides whether a node should be split and if so at which attribute and which value.
    /// </summary>
    /// <param name="splitData">the data of the node</param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="splitAttr">the best attribute found (empty if none was found)</param>
    /// <param name="splitValue">the threshold belonging to <paramref name="splitAttr"/></param>
    /// <returns>true if a suitable split was found, false if the node should become a leaf</returns>
    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
      var bestPos = -1; //index of the last instance of the left side, -1 as long as no split was found
      var bestImpurity = double.MinValue;
      var bestSplitValue = 0.0;
      var bestSplitAttr = string.Empty;
      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;
      var rows = splitData.Dataset.Rows;
      if (rows < minLeafSize) return false;

      //the targets are the same for every attribute, read them only once
      var targets = splitData.Dataset.GetDoubleValues(splitData.TargetVariable).ToArray();

      //find best Attribute for the Splitter
      foreach (var attr in splitData.AllowedInputVariables) {
        int pos;
        double impurity, sValue;
        var sortedData = splitData.Dataset.GetDoubleValues(attr).Zip(targets, Tuple.Create).OrderBy(x => x.Item1).ToArray();
        //attributes without any admissible position must not compete; their placeholder score
        //(-1E20) would otherwise beat the initial bestImpurity and be reported as a split at 0.0
        if (!AttributeSplit(sortedData.Select(x => x.Item1).ToArray(), sortedData.Select(x => x.Item2).ToArray(), minLeafSize, out pos, out impurity, out sValue)) continue;
        if (!(bestImpurity < impurity)) continue;
        bestImpurity = impurity;
        bestPos = pos;
        bestSplitValue = sValue;
        bestSplitAttr = attr;
      }

      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;

      //if no suitable split exists => leafNode
      if (bestPos < 0) return false;
      var leftSize = bestPos + 1;
      var rightSize = rows - leftSize;
      return leftSize >= minLeafSize && rightSize >= minLeafSize;
    }

    /// <summary>
    /// Finds the best split position for a single attribute.
    /// </summary>
    /// <param name="attValues">attribute values in ascending order</param>
    /// <param name="targetValues">target values in the order of <paramref name="attValues"/></param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="position">index of the last instance left of the best split</param>
    /// <param name="maxImpurity">score of the best split</param>
    /// <param name="splitValue">threshold of the best split</param>
    /// <returns>true if at least one admissible position exists; the out values are placeholders otherwise</returns>
    private bool AttributeSplit(IReadOnlyList<double> attValues, IReadOnlyList<double> targetValues, int minLeafSize, out int position, out double maxImpurity, out double splitValue) {
      position = 0;
      maxImpurity = -1E20;
      splitValue = 0.0;
      var found = false;
      var length = targetValues.Count;

      //the procedure follows the weka code (according to the original comment)
      if (length < 4) return false;

      //number of instances at both ends that are never separated from their side
      var len = Math.Max(minLeafSize - 1, length < 5 ? 1 : length / 5);
      var imp = new OrderImpurityCalculator(len, targetValues, Order);

      for (var i = len; i < length - 1 - len; i++) {
        imp.Increment(targetValues[i], OrderImpurityCalculator.IncrementType.Left);
        if (attValues[i].IsAlmost(attValues[i + 1])) continue; //splits can not be made between two equal points
        if (imp.Impurity < maxImpurity) continue;
        maxImpurity = imp.Impurity;
        splitValue = (attValues[i] + attValues[i + 1]) / 2;
        position = i;
        found = true;
      }
      return found;
    }
    #endregion
  }
}
Spliting/NeumaierSum.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Runtime.CompilerServices;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Running sum with a separately accumulated correction term.
  /// The low-order part that gets lost when a value is added to the running sum is collected
  /// in the correction and only merged into the result when the sum is read.
  /// </summary>
  internal class NeumaierSum {
    #region state
    private double sum;
    private double correction;
    #endregion

    #region Constructors
    /// <summary>Creates a sum that starts at <paramref name="startvalue"/> with an empty correction.</summary>
    public NeumaierSum(double startvalue) {
      sum = startvalue;
      correction = 0;
    }
    #endregion

    /// <summary>Adds <paramref name="value"/> to the sum and records the part lost to rounding.</summary>
    [MethodImpl(MethodImplOptions.NoOptimization)]
    public void Add(double value) {
      var total = sum + value;
      //the operand with the larger magnitude is subtracted from the total first,
      //the remainder is what the other operand failed to contribute
      if (Math.Abs(sum) >= Math.Abs(value)) {
        correction += (sum - total) + value;
      } else {
        correction += (value - total) + sum;
      }
      sum = total;
    }

    /// <summary>Returns the running sum including the accumulated correction.</summary>
    public double Get() {
      return sum + correction;
    }

    /// <summary>Scales the sum and its correction by <paramref name="value"/>.</summary>
    public void Mul(double value) {
      correction *= value;
      sum *= value;
    }
  }
}
Spliting/OptimumSearchingSplitter.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Split selector that separates the instances at a quantile of the target values
  /// (given by the search strength) and scores split positions by how the two groups
  /// are distributed over the two sides of the split.
  /// </summary>
  [StorableClass]
  [Item("OptimumSearchingSplitter", "A split selector that favours higher resolution splits near percieved optima.\n Decribed in \"Model-Based Genetic Algorithms for Algorithm Configuration\" by Carlos Ansotegui et al ")]
  public class OptimumSearchingSplitter : ParameterizedNamedItem, ISplitter {
    public const string SearchStrengthParameterName = "Search Strength";
    public const string MaximizationParamterName = "Maximization";
    public const string OrderParameterName = "Order";
    public IFixedValueParameter<DoubleValue> OrderParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[OrderParameterName]; }
    }
    public IFixedValueParameter<PercentValue> SearchStrengthParameter {
      get { return (IFixedValueParameter<PercentValue>)Parameters[SearchStrengthParameterName]; }
    }
    public IFixedValueParameter<BoolValue> MaximizationParameter {
      get { return (IFixedValueParameter<BoolValue>)Parameters[MaximizationParamterName]; }
    }
    public double Order {
      get { return OrderParameter.Value.Value; }
    }
    public double SearchStrength {
      get { return SearchStrengthParameter.Value.Value; }
    }
    public bool Maximization {
      get { return MaximizationParameter.Value.Value; }
    }

    #region Constructors & Cloning
    //the storable constructor has to chain to the base class (as the other items of this plugin do),
    //otherwise the parameterless base constructor runs during deserialization
    [StorableConstructor]
    private OptimumSearchingSplitter(bool deserializing) : base(deserializing) { }
    private OptimumSearchingSplitter(OptimumSearchingSplitter original, Cloner cloner) : base(original, cloner) { }
    public OptimumSearchingSplitter() {
      Parameters.Add(new FixedValueParameter<DoubleValue>(OrderParameterName, "The exponent in the split calculation sum (x_i - x_avg)^Order.", new DoubleValue(2)));
      Parameters.Add(new FixedValueParameter<PercentValue>(SearchStrengthParameterName, "How strong the spliting process should be skewed towards/away from the percieved optimum", new PercentValue(0.10)));
      Parameters.Add(new FixedValueParameter<BoolValue>(MaximizationParamterName, "Whether the splitting procedure should asume a minimization or maximization procedure."));
    }
    public override IDeepCloneable Clone(Cloner cloner) {
      return new OptimumSearchingSplitter(this, cloner);
    }
    #endregion

    #region ISplitter
    /// <summary>
    /// Decides whether a node should be split and if so at which attribute and which value.
    /// </summary>
    /// <param name="splitData">the data of the node</param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="splitAttr">the best attribute found (empty if none was found)</param>
    /// <param name="splitValue">the threshold belonging to <paramref name="splitAttr"/></param>
    /// <returns>true if a suitable split was found, false if the node should become a leaf</returns>
    public bool Split(IRegressionProblemData splitData, int minLeafSize, out string splitAttr, out double splitValue) {
      var bestImpurity = double.MinValue;
      var bestSplitValue = 0.0;
      var bestSplitAttr = string.Empty;
      var bestSize = 0;
      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;

      //decide on the trivial cases before a quantile of (possibly empty) data is requested
      if (splitData.Dataset.Rows < minLeafSize) return false;
      var targets = splitData.TargetVariableValues.ToArray();
      if (targets.Length == 0) return false;

      var vh = targets.Quantile(Maximization ? 1 - SearchStrength : SearchStrength);
      //indices of all instances whose target lies below the quantile
      var lower = new HashSet<int>(targets.Select((x, i) => new {x, i}).Where(e => e.x < vh).Select(e => e.i));

      foreach (var attr in splitData.AllowedInputVariables) {
        int pos;
        double impurity, sValue;
        AttributeSplit(splitData, lower, attr, vh, minLeafSize, out pos, out impurity, out sValue);
        if (!(bestImpurity < impurity)) continue;
        bestImpurity = impurity;
        bestSplitValue = sValue;
        bestSplitAttr = attr;
        bestSize = pos;
      }

      splitAttr = bestSplitAttr;
      splitValue = bestSplitValue;

      //if no suitable split exists => leafNode
      //bestSize is 0 if no attribute offered a split; without the first check this would
      //be reported as a split at 0.0 for minLeafSize == 0
      return bestSize > 0 && bestSize >= minLeafSize && bestSize <= splitData.Dataset.Rows - minLeafSize;
    }

    /// <summary>
    /// Finds the best split position for a single attribute.
    /// </summary>
    /// <param name="splitData">the data of the node</param>
    /// <param name="t">indices of the instances whose target lies below <paramref name="vh"/></param>
    /// <param name="attribute">the attribute that is examined</param>
    /// <param name="vh">the target quantile that separates the two groups of instances</param>
    /// <param name="minLeafSize">minimal number of instances on each side of the split</param>
    /// <param name="leftSize">number of instances left of the best split, 0 if no split was found</param>
    /// <param name="maxImpurity">score of the best split</param>
    /// <param name="splitValue">threshold of the best split</param>
    private void AttributeSplit(IRegressionProblemData splitData, ICollection<int> t, string attribute, double vh, int minLeafSize, out int leftSize, out double maxImpurity, out double splitValue) {
      leftSize = 0;
      maxImpurity = -1E20;
      splitValue = 0.0;
      var length = splitData.Dataset.Rows;

      //NOTE(review): the sums of the right side start at 0 and rtn starts at the size of t although it is
      //decremented for instances outside of t; this is kept as in the original, please verify against the paper
      double lls = 0, rls = 0, lts = 0, rts = 0;
      int ltn = 0, rtn = t.Count;

      var points = splitData.Dataset.GetDoubleValues(attribute).Select((x, i) => new {x, i}).OrderBy(e => e.x).ToArray();
      //points[i + 1] is read below, therefore the last point can never be the left neighbour of a split
      var end = Math.Min(length - minLeafSize, length - 1);
      for (var i = 0; i < end; i++) {
        var point = points[i];
        var con = Contribution(splitData, vh, point.i);
        //move contribution to and from respective sums
        //t holds indices of the unsorted data, so the original index of the point has to be looked up (not the sorted position)
        if (t.Contains(point.i)) {
          lls += con;
          rls -= con;
        }
        else {
          ltn++;
          rtn--;
          lts += con;
          rts -= con;
        }

        //splits can not be made between two equal points
        if (point.x.IsAlmost(points[i + 1].x)) continue;

        //calculate impurity / score
        var al = (ltn + lts) / (1 + lls);
        var ar = (rtn + rts) / (1 + rls);
        var impurity = ltn > rtn ? al : ltn < rtn ? ar : Math.Min(al, ar);

        if (i < minLeafSize || impurity < maxImpurity) continue;
        maxImpurity = impurity;
        splitValue = (point.x + points[i + 1].x) / 2;
        leftSize = i + 1;
      }
    }

    /// <summary>Distance of the target of instance <paramref name="i"/> to <paramref name="vh"/>, raised to the power of Order.</summary>
    private double Contribution(IRegressionProblemData splitData, double vh, int i) {
      var v = splitData.Dataset.GetDoubleValue(splitData.TargetVariable, i) - vh;
      return Math.Pow(v, Order);
    }
    #endregion
  }
}
Spliting/OrderImpurityCalculator.cs
27 27 namespace HeuristicLab.Algorithms.DataAnalysis { 28 28 /// <summary> 29 29 /// Helper class for incremental split calculation. 30 /// Used while moving a potential Split er along the ordered training Instances30 /// Used while moving a potential Splitter along the ordered training Instances 31 31 /// </summary> 32 32 internal class OrderImpurityCalculator { 33 33 internal enum IncrementType { … … 104 104 VarLeft = NoLeft <= 0 ? 0 : Math.Abs(NoLeft * SqSumLeft - SumLeft * SumLeft) / (NoLeft * NoLeft); 105 105 VarRight = NoRight <= 0 ? 0 : Math.Abs(NoRight * SqSumRight - SumRight * SumRight) / (NoRight * NoRight); 106 106 107 if (Order <= 0) throw new ArgumentException("Split er order must be larger than 0");107 if (Order <= 0) throw new ArgumentException("Splitter order must be larger than 0"); 108 108 if (Order.IsAlmost(1)) { 109 109 y = VarTotal; 110 110 yl = VarLeft; … … 115 115 yl = Math.Pow(VarLeft, 1.0 / Order); 116 116 yr = Math.Pow(VarRight, 1.0 / Order); 117 117 } 118 var t = NoRight + NoLeft; 119 if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Spliter = 0; 120 else Impurity = y - NoLeft / t * yl - NoRight / t * yr; // Spliter = y - NoLeft / NoRight * yl - NoRight / NoLeft * yr 118 if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Splitter = 0; 119 else Impurity = y - (NoLeft * yl + NoRight * yr) / (NoRight + NoLeft); 121 120 } 122 121 #endregion 123 122 } -
Spliting/UnivariateOnlineLR.cs
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Univariate linear regression (target = alpha + beta * attribute) whose sum of squared
  /// residuals is kept up to date while single instances are added or removed.
  /// </summary>
  internal class UnivariateOnlineLR {
    #region state
    //not readonly: the state is recreated when the last instance is removed
    private NeumaierSum targetMean;
    private NeumaierSum attributeMean;
    private NeumaierSum targetVarSum;
    private NeumaierSum attributeVarSum;
    private NeumaierSum comoment;
    private NeumaierSum ssr;
    private int size;
    #endregion

    /// <summary>Sum of squared residuals of the current model on the current instances.</summary>
    public double Ssr {
      get { return ssr.Get(); }
    }

    /// <summary>Number of instances the model is currently based on.</summary>
    public int Size {
      get { return size; }
    }

    private double Beta {
      get {
        //without any spread in the attribute the slope is undetermined (0/0); the constant model is used
        //instead, otherwise a NaN would enter the sum of squared residuals and never leave it again
        var varX = attributeVarSum.Get();
        return varX > 0 ? comoment.Get() / varX : 0.0;
      }
    }

    private double Alpha {
      get { return targetMean.Get() - Beta * attributeMean.Get(); }
    }

    /// <summary>
    /// Fits the model to the given instances.
    /// </summary>
    /// <param name="attributeValues">attribute values of the instances (may be empty)</param>
    /// <param name="targetValues">target values of the instances</param>
    /// <exception cref="ArgumentException">if the two collections differ in length</exception>
    public UnivariateOnlineLR(ICollection<double> attributeValues, ICollection<double> targetValues) {
      if (attributeValues.Count != targetValues.Count) throw new ArgumentException("Targets and Attributes need to have the same length");
      Reset();
      size = attributeValues.Count;
      if (size == 0) return; //an empty model is valid, instances can be added later

      var yMean = targetValues.Average();
      var xMean = attributeValues.Average();
      targetMean = new NeumaierSum(yMean);
      attributeMean = new NeumaierSum(xMean);
      targetVarSum = new NeumaierSum(targetValues.Sum(y => (y - yMean) * (y - yMean)));
      attributeVarSum = new NeumaierSum(attributeValues.Sum(x => (x - xMean) * (x - xMean)));
      comoment = new NeumaierSum(attributeValues.Zip(targetValues, (x, y) => (x - xMean) * (y - yMean)).Sum());

      var beta = Beta;
      var alpha = Alpha;
      ssr = new NeumaierSum(attributeValues.Zip(targetValues, (x, y) => y - alpha - beta * x).Sum(r => r * r));
    }

    /// <summary>Adds one instance to the model.</summary>
    public void Add(double attributeValue, double targetValue) {
      var residualOld = Residual(attributeValue, targetValue);

      size++;
      var dx = attributeValue - attributeMean.Get();
      var dy = targetValue - targetMean.Get();
      attributeMean.Add(dx / size);
      targetMean.Add(dy / size);
      var dx2 = attributeValue - attributeMean.Get();
      var dy2 = targetValue - targetMean.Get();
      attributeVarSum.Add(dx * dx2);
      targetVarSum.Add(dy * dy2);
      comoment.Add(dx * dy2);

      //the sum of squared residuals grows by the product of the residuals of the new instance
      //under the old and under the updated model
      ssr.Add(residualOld * Residual(attributeValue, targetValue));
    }

    /// <summary>Removes one instance (that has to be part of the model) from the model.</summary>
    /// <exception cref="InvalidOperationException">if the model does not contain any instances</exception>
    public void Remove(double attributeValue, double targetValue) {
      if (size <= 0) throw new InvalidOperationException("Can not remove an instance from an empty model");
      if (size == 1) {
        //the update below would divide by size - 1 = 0
        Reset();
        return;
      }

      var residualOld = Residual(attributeValue, targetValue);

      var dx2 = attributeValue - attributeMean.Get();
      var dy2 = targetValue - targetMean.Get();
      attributeMean.Mul(size / (size - 1.0));
      targetMean.Mul(size / (size - 1.0));
      attributeMean.Add(-attributeValue / (size - 1.0));
      targetMean.Add(-targetValue / (size - 1.0));
      var dx = attributeValue - attributeMean.Get();
      var dy = targetValue - targetMean.Get();
      attributeVarSum.Add(-dx * dx2);
      targetVarSum.Add(-dy * dy2);
      comoment.Add(-dx * dy2);
      size--;

      ssr.Add(-residualOld * Residual(attributeValue, targetValue));
    }

    //residual of an instance under the current model
    private double Residual(double attributeValue, double targetValue) {
      return targetValue - Alpha - Beta * attributeValue;
    }

    //state of a model without instances
    private void Reset() {
      size = 0;
      targetMean = new NeumaierSum(0);
      attributeMean = new NeumaierSum(0);
      targetVarSum = new NeumaierSum(0);
      attributeVarSum = new NeumaierSum(0);
      comoment = new NeumaierSum(0);
      ssr = new NeumaierSum(0);
    }
  }
}
Note: See TracBrowser
for help on using the repository browser.