#region License Information /* HeuristicLab * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Threading; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Problems.DataAnalysis; using HEAL.Attic; namespace HeuristicLab.Algorithms.DataAnalysis { [StorableType("58517042-5318-4087-B098-AC75F0208BA0")] [Item("M5Leaf", "A leaf type that uses regularized linear models with feature selection as leaf models.")] public class M5Leaf : LeafBase { #region Constructors & Cloning [StorableConstructor] private M5Leaf(StorableConstructorFlag _) : base(_) { } private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { } public M5Leaf() { } public override IDeepCloneable Clone(Cloner cloner) { return new M5Leaf(this, cloner); } #endregion #region IModelType public override bool ProvidesConfidence { get { return false; } } public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) { if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model"); if (pd.Dataset.Rows == 1) return new ConstantLeaf().Build(pd, random, cancellationToken, out numberOfParameters); var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average()); var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Variance()); var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList(); var targetMean = pd.TargetVariableTrainingValues.Average(); var targetVariance = pd.TargetVariableTrainingValues.Variance(); var model = FindBestModel(variances, means, targetMean, targetVariance, pd, used); numberOfParameters = 1 + model.Coefficients.Count; return model; } public override int MinLeafSize(IRegressionProblemData pd) { return 1; } #endregion private static PreconstructedLinearModel FindBestModel(Dictionary variances, Dictionary means, double yMean, double yVariance, IRegressionProblemData pd, IList variables) { Dictionary coeffs; double intercept; do { coeffs = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept); variables = DeselectColinear(variances, coeffs, yVariance, pd, variables); } while (coeffs.Count != variables.Count); var numAtts = variables.Count; var numInst = pd.TrainingIndices.Count(); var fullMse = CalculateSE(coeffs, intercept, pd, variables); var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts; var improved = true; var currentNumAttributes = numAtts; while (improved && currentNumAttributes > 1) { improved = false; currentNumAttributes--; // Find attribute with smallest SC (variance-scaled coefficient) var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance))) .OrderBy(x => x.Value).Select(x => x.Key).First(); var currVariables = variables.Where(v => !v.Equals(candidate)).ToList(); var currentIntercept = 0.0; var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept); var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables); var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes; if (!(currentAkaike < akaike)) continue; improved = true; akaike = currentAkaike; coeffs = currentCoeffs; intercept = currentIntercept; variables = currVariables; } var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable); pd2.TestPartition.End = pd.TestPartition.End; pd2.TestPartition.Start = pd.TestPartition.Start; pd2.TrainingPartition.End = pd.TrainingPartition.End; pd2.TrainingPartition.Start = pd.TrainingPartition.Start; return new PreconstructedLinearModel(coeffs, intercept, pd.TargetVariable); } private static Dictionary DoRegression(IRegressionProblemData pd, IList variables, Dictionary variances, Dictionary means, double yMean, double ridge, out double intercept) { var n = variables.Count; var m = pd.TrainingIndices.Count(); var inTr = new double[n, m]; for (var i = 0; i < n; i++) { var v = variables[i]; var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray(); var sd = Math.Sqrt(variances[v]); var mean = means[v]; for (var j = 0; j < m; j++) { inTr[i, j] = (vdata[j] - mean) / sd; } } var y = new double[m, 1]; var ydata = pd.TargetVariableTrainingValues.ToArray(); for (var i = 0; i < m; i++) y[i, 0] = ydata[i]; //no scaling for targets; var aTy = new double[n, 1]; alglib.rmatrixgemm(n, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); //aTy = inTr * y; var aTa = new double[n, n]; alglib.rmatrixgemm(n, n, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); //aTa = inTr * t(inTr) +aTa // var aTaDecomp = new double[n, n]; bool success; var tries = 0; double[] coefficients = null; do { for (var i = 0; i < n; i++) aTa[i, i] += ridge; // add ridge to diagonal to enforce singularity try { //solve "aTa * coefficients = aTy" for coefficients; Array.Copy(aTa, 0, aTaDecomp, 0, aTa.Length); alglib.spdmatrixcholesky(ref aTaDecomp, n, true); int info; alglib.densesolverreport report; alglib.spdmatrixcholeskysolve(aTaDecomp, n, true, ydata, out info, out report, out coefficients); if (info != 1) throw new Exception(); success = true; } catch (Exception) { for (var i = 0; i < n; i++) aTa[i, i] -= ridge; ridge *= 10; // increase ridge; success = false; } finally { tries++; } } while (!success && tries < 100); if (coefficients == null) throw new ArgumentException("No linear model could be built"); intercept = yMean; var res = new Dictionary(); for (var i = 0; i < n; i++) { var v = variables[i]; res.Add(v, coefficients[i] /= Math.Sqrt(variances[v])); intercept -= coefficients[i] * means[v]; } return res; } private static IList DeselectColinear(Dictionary variances, Dictionary coeffs, double yVariance, IRegressionProblemData pd, IList variables) { var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance))).Where(x => x.Value > 1.5).OrderBy(x => -x.Value).ToList(); if (candidates.Count == 0) return variables; var c = candidates.First().Key; return variables.Where(v => !v.Equals(c)).ToList(); } private static double CalculateSE(Dictionary coefficients, double intercept, IRegressionProblemData pd, IList variables) { return pd.TrainingIndices.Select(i => RegressionPrediction(i, pd, variables, coefficients, intercept) - pd.Dataset.GetDoubleValue(pd.TargetVariable, i)).Select(error => error * error).Sum(); } private static double RegressionPrediction(int i, IRegressionProblemData pd, IList variables, Dictionary coefficients, double intercept) { return intercept + variables.Sum(v => pd.Dataset.GetDoubleValue(v, i) * coefficients[v]); } } }