1  #region License Information


2  /* HeuristicLab


3  * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)


4  *


5  * This file is part of HeuristicLab.


6  *


7  * HeuristicLab is free software: you can redistribute it and/or modify


8  * it under the terms of the GNU General Public License as published by


9  * the Free Software Foundation, either version 3 of the License, or


10  * (at your option) any later version.


11  *


12  * HeuristicLab is distributed in the hope that it will be useful,


13  * but WITHOUT ANY WARRANTY; without even the implied warranty of


14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


15  * GNU General Public License for more details.


16  *


17  * You should have received a copy of the GNU General Public License


18  * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.


19  */


20  #endregion


21 


22  using System;


23  using System.Collections.Generic;


24  using System.Linq;


25  using System.Threading;


26  using HeuristicLab.Common;


27  using HeuristicLab.Core;


28  using HeuristicLab.Problems.DataAnalysis;


29  using HEAL.Attic;


30 


31  namespace HeuristicLab.Algorithms.DataAnalysis {


[StorableType("58517042-5318-4087-B098-AC75F0208BA0")]
[Item("M5Leaf", "A leaf type that uses regularized linear models with feature selection as leaf models.")]
public class M5Leaf : LeafBase {
  #region Constructors & Cloning
  [StorableConstructor]
  private M5Leaf(StorableConstructorFlag _) : base(_) { }
  private M5Leaf(M5Leaf original, Cloner cloner) : base(original, cloner) { }
  public M5Leaf() { }
  public override IDeepCloneable Clone(Cloner cloner) {
    return new M5Leaf(this, cloner);
  }
  #endregion

  #region IModelType
  // M5 leaves produce point predictions only; no confidence estimates are available.
  public override bool ProvidesConfidence {
    get { return false; }
  }

  /// <summary>
  /// Builds a ridge-regularized linear model with greedy, Akaike-criterion-based
  /// feature elimination on the training partition of <paramref name="pd"/>.
  /// </summary>
  /// <param name="pd">Problem data; only the training partition is used for fitting.</param>
  /// <param name="random">Not used by this leaf type (part of the leaf-model interface).</param>
  /// <param name="cancellationToken">Not observed by this leaf type (part of the leaf-model interface).</param>
  /// <param name="numberOfParameters">Set to 1 (intercept) plus one coefficient per selected variable.</param>
  /// <returns>The fitted linear regression model.</returns>
  /// <exception cref="ArgumentException">Thrown when the dataset contains no rows.</exception>
  public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
    if (pd.Dataset.Rows == 0) throw new ArgumentException("The number of training instances is too small to create an M5 leaf model");

    // A single sample cannot support a linear fit; fall back to a constant model.
    if (pd.Dataset.Rows == 1)
      return new ConstantLeaf().Build(pd, random, cancellationToken, out numberOfParameters);

    var means = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Average());
    var variances = pd.AllowedInputVariables.ToDictionary(n => n, n => pd.Dataset.GetDoubleValues(n, pd.TrainingIndices).Variance());
    // Constant inputs carry no information and would break standardization (division by sd == 0).
    var used = pd.AllowedInputVariables.Where(v => !variances[v].IsAlmost(0.0)).ToList();

    var targetMean = pd.TargetVariableTrainingValues.Average();
    var targetVariance = pd.TargetVariableTrainingValues.Variance();

    var model = FindBestModel(variances, means, targetMean, targetVariance, pd, used);
    numberOfParameters = 1 + model.Coefficients.Count;
    return model;
  }

  // A linear model can technically be fitted (as a constant fallback) from one row.
  public override int MinLeafSize(IRegressionProblemData pd) {
    return 1;
  }
  #endregion

  /// <summary>
  /// Fits a ridge regression, first repeatedly removing collinear variables, then greedily
  /// dropping the attribute with the smallest standardized coefficient for as long as the
  /// Akaike criterion improves.
  /// </summary>
  /// <param name="variances">Per-variable training variances.</param>
  /// <param name="means">Per-variable training means.</param>
  /// <param name="yMean">Mean of the target over the training partition.</param>
  /// <param name="yVariance">Variance of the target over the training partition.</param>
  /// <param name="pd">Problem data supplying the training instances.</param>
  /// <param name="variables">Initial candidate variables (non-constant inputs).</param>
  /// <returns>The best linear model found.</returns>
  private static PreconstructedLinearModel FindBestModel(Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double yVariance, IRegressionProblemData pd, IList<string> variables) {
    Dictionary<string, double> coeffs;
    double intercept;
    // Refit until no further collinear variable is removed.
    do {
      coeffs = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept);
      variables = DeselectColinear(variances, coeffs, yVariance, pd, variables);
    }
    while (coeffs.Count != variables.Count);
    var numAtts = variables.Count;
    var numInst = pd.TrainingIndices.Count();
    var fullMse = CalculateSE(coeffs, intercept, pd, variables); // squared error of the full model (baseline)
    var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts;

    var improved = true;
    var currentNumAttributes = numAtts;

    while (improved && currentNumAttributes > 1) {
      improved = false;
      currentNumAttributes--;
      // Find the attribute with the smallest standardized coefficient (least influential).
      var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance)))
        .OrderBy(x => x.Value).Select(x => x.Key).First();

      var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
      var currentIntercept = 0.0;
      var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept);
      var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
      var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;

      // Only accept the smaller model if it improves the criterion.
      if (!(currentAkaike < akaike)) continue;
      improved = true;
      akaike = currentAkaike;
      coeffs = currentCoeffs;
      intercept = currentIntercept;
      variables = currVariables;
    }

    // NOTE(review): pd2 is constructed but never used; kept to preserve any validation
    // side effects of the RegressionProblemData constructor — candidate for removal.
    var pd2 = new RegressionProblemData(pd.Dataset, variables, pd.TargetVariable);
    pd2.TestPartition.End = pd.TestPartition.End;
    pd2.TestPartition.Start = pd.TestPartition.Start;
    pd2.TrainingPartition.End = pd.TrainingPartition.End;
    pd2.TrainingPartition.Start = pd.TrainingPartition.Start;

    return new PreconstructedLinearModel(coeffs, intercept, pd.TargetVariable);
  }

  /// <summary>
  /// Solves a ridge regression on standardized inputs via the normal equations
  /// (aTa + ridge*I) * coefficients = aTy, increasing the ridge tenfold whenever the
  /// Cholesky factorization/solve fails. Returned coefficients and the intercept are
  /// transformed back to the original (unstandardized) variable scale.
  /// </summary>
  /// <param name="ridge">Initial ridge value added to the diagonal (grown on failure).</param>
  /// <param name="intercept">Receives the model intercept on the original scale.</param>
  /// <returns>Variable name to coefficient mapping on the original scale.</returns>
  /// <exception cref="ArgumentException">Thrown when no solvable system is found after 100 tries.</exception>
  private static Dictionary<string, double> DoRegression(IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> variances, Dictionary<string, double> means, double yMean, double ridge, out double intercept) {
    var n = variables.Count;
    var m = pd.TrainingIndices.Count();

    // Transposed, standardized design matrix: one row per variable, one column per instance.
    var inTr = new double[n, m];
    for (var i = 0; i < n; i++) {
      var v = variables[i];
      var vdata = pd.Dataset.GetDoubleValues(v, pd.TrainingIndices).ToArray();
      var sd = Math.Sqrt(variances[v]);
      var mean = means[v];
      for (var j = 0; j < m; j++) {
        inTr[i, j] = (vdata[j] - mean) / sd;
      }
    }

    var y = new double[m, 1];
    var ydata = pd.TargetVariableTrainingValues.ToArray();
    for (var i = 0; i < m; i++)
      y[i, 0] = ydata[i]; // no scaling for targets

    var aTy = new double[n, 1];
    alglib.rmatrixgemm(n, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); // aTy = inTr * y
    var aTa = new double[n, n];
    alglib.rmatrixgemm(n, n, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); // aTa = inTr * t(inTr)

    // Right-hand side of the normal equations as a flat length-n vector.
    // BUGFIX: the solver must receive aTy (length n), not the raw targets (length m).
    var rhs = new double[n];
    for (var i = 0; i < n; i++) rhs[i] = aTy[i, 0];

    var aTaDecomp = new double[n, n];
    bool success;
    var tries = 0;
    double[] coefficients = null;
    do {
      for (var i = 0; i < n; i++) aTa[i, i] += ridge; // add ridge to the diagonal to avoid singularity
      try {
        // solve "aTa * coefficients = aTy" for coefficients
        Array.Copy(aTa, 0, aTaDecomp, 0, aTa.Length);
        alglib.spdmatrixcholesky(ref aTaDecomp, n, true);
        int info;
        alglib.densesolverreport report;
        alglib.spdmatrixcholeskysolve(aTaDecomp, n, true, rhs, out info, out report, out coefficients);

        // info != 1 signals a failed solve; escalate into the retry path below.
        if (info != 1) throw new InvalidOperationException("Cholesky solve failed");
        success = true;
      }
      catch (Exception) {
        // Undo the ridge added above, then retry with a ten times larger one.
        for (var i = 0; i < n; i++) aTa[i, i] -= ridge;
        ridge *= 10;
        success = false;
      }
      finally {
        tries++;
      }
    }
    while (!success && tries < 100);
    if (coefficients == null) throw new ArgumentException("No linear model could be built");

    // Map coefficients back to the original variable scale and accumulate the intercept:
    // intercept = yMean - sum_i(coeff_i * mean_i) because the inputs were mean-centered.
    intercept = yMean;
    var res = new Dictionary<string, double>();
    for (var i = 0; i < n; i++) {
      var v = variables[i];
      coefficients[i] /= Math.Sqrt(variances[v]);
      res.Add(v, coefficients[i]);
      intercept -= coefficients[i] * means[v];
    }

    return res;
  }

  /// <summary>
  /// Removes at most one collinearity-suspect variable: among variables whose standardized
  /// coefficient magnitude exceeds 1.5, the one with the smallest such value is dropped.
  /// Returns the input list unchanged when no variable qualifies.
  /// </summary>
  private static IList<string> DeselectColinear(Dictionary<string, double> variances, Dictionary<string, double> coeffs, double yVariance, IRegressionProblemData pd, IList<string> variables) {
    var candidates = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance))).Where(x => x.Value > 1.5).OrderBy(x => x.Value).ToList();
    if (candidates.Count == 0) return variables;
    var c = candidates.First().Key;
    return variables.Where(v => !v.Equals(c)).ToList();
  }

  /// <summary>
  /// Sum of squared residuals of the given linear model over the training partition.
  /// </summary>
  private static double CalculateSE(Dictionary<string, double> coefficients, double intercept, IRegressionProblemData pd, IList<string> variables) {
    return pd.TrainingIndices.Select(i => RegressionPrediction(i, pd, variables, coefficients, intercept) - pd.Dataset.GetDoubleValue(pd.TargetVariable, i)).Select(error => error * error).Sum();
  }

  /// <summary>
  /// Linear model prediction for a single dataset row: intercept + sum(coeff * value).
  /// </summary>
  private static double RegressionPrediction(int i, IRegressionProblemData pd, IList<string> variables, Dictionary<string, double> coefficients, double intercept) {
    return intercept + variables.Sum(v => pd.Dataset.GetDoubleValue(v, i) * coefficients[v]);
  }
}


198  } 
