#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Linq;
using HeuristicLab.Analysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;
using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;

namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet {
  /// <summary>
  /// Linear regression with elastic-net regularization; wraps the native glmnet
  /// library. Either calculates a single solution for a fixed lambda value or
  /// the full regularization path over a range of lambda values.
  /// </summary>
  [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
  [StorableClass]
  public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
    // NOTE(review): "Penality" is a misspelling of "Penalty"; the string is kept
    // unchanged because it presumably acts as the persistence key of the
    // parameter — confirm before renaming.
    private const string PenalityParameterName = "Penality";
    private const string LogLambdaParameterName = "Log10(Lambda)";

    #region parameters
    public IFixedValueParameter<DoubleValue> PenalityParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[PenalityParameterName]; }
    }
    public IValueParameter<DoubleValue> LogLambdaParameter {
      get { return (IValueParameter<DoubleValue>)Parameters[LogLambdaParameterName]; }
    }
    #endregion

    #region properties
    /// <summary>Mixing factor between ridge (0.0) and lasso (1.0) regression.</summary>
    public double Penality {
      get { return PenalityParameter.Value.Value; }
      set { PenalityParameter.Value.Value = value; }
    }
    /// <summary>Optional log10 of lambda; null means the whole path of lambdas is calculated.</summary>
    public DoubleValue LogLambda {
      get { return LogLambdaParameter.Value; }
      set { LogLambdaParameter.Value = value; }
    }
    #endregion

    [StorableConstructor]
    private ElasticNetLinearRegression(bool deserializing) : base(deserializing) { }
    private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner)
      : base(original, cloner) {
    }
    public ElasticNetLinearRegression()
      : base() {
      Problem = new RegressionProblem();
      Parameters.Add(new FixedValueParameter<DoubleValue>(PenalityParameterName, "Penalty factor for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5)));
      Parameters.Add(new OptionalValueParameter<DoubleValue>(LogLambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas"));
    }

    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() { }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ElasticNetLinearRegression(this, cloner);
    }

    protected override void Run() {
      // A fixed lambda produces one solution; otherwise the whole path is calculated.
      if (LogLambda == null) {
        CreateSolutionPath();
      } else {
        CreateSolution(LogLambda.Value);
      }
    }

    /// <summary>
    /// Calculates one elastic-net solution for lambda = 10^logLambda, wraps the
    /// coefficients into a symbolic regression model, and adds model and NMSE
    /// values to the results.
    /// </summary>
    private void CreateSolution(double logLambda) {
      double trainNMSE;
      double testNMSE;
      var coeff = CreateElasticNetLinearRegressionSolution(Problem.ProblemData, Penality, Math.Pow(10, logLambda), out trainNMSE, out testNMSE);
      Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE)));
      Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE)));

      // copied from LR => TODO: reuse code (but skip coefficients = 0.0)
      ISymbolicExpressionTree tree = new SymbolicExpressionTree(new ProgramRootSymbol().CreateTreeNode());
      ISymbolicExpressionTreeNode startNode = new StartSymbol().CreateTreeNode();
      tree.Root.AddSubtree(startNode);
      ISymbolicExpressionTreeNode addition = new Addition().CreateTreeNode();
      startNode.AddSubtree(addition);

      // one variable node per input variable with a non-zero coefficient
      int col = 0;
      foreach (string column in Problem.ProblemData.AllowedInputVariables) {
        if (!coeff[col].IsAlmost(0.0)) {
          VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
          vNode.VariableName = column;
          vNode.Weight = coeff[col];
          addition.AddSubtree(vNode);
        }
        col++;
      }

      // the last entry of the coefficient vector is the intercept
      if (!coeff[coeff.Length - 1].IsAlmost(0.0)) {
        ConstantTreeNode cNode = (ConstantTreeNode)new Constant().CreateTreeNode();
        cNode.Value = coeff[coeff.Length - 1];
        addition.AddSubtree(cNode);
      }

      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
        new SymbolicRegressionModel(Problem.ProblemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
        (IRegressionProblemData)Problem.ProblemData.Clone());
      solution.Model.Name = "Elastic-net Linear Regression Model";
      solution.Name = "Elastic-net Linear Regression Solution";

      Results.Add(new Result(solution.Name, solution.Description, solution));
    }

    /// <summary>
    /// Runs the whole regularization path and adds two charts to the results:
    /// the (standardized) coefficient paths and the train/test NMSE values over
    /// the lambda values.
    /// </summary>
    private void CreateSolutionPath() {
      double[] lambda;
      double[] trainNMSE;
      double[] testNMSE;
      double[,] coeff;
      double[] intercept;
      RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);

      var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
      coeffTable.VisualProperties.YAxisMaximumAuto = false;
      coeffTable.VisualProperties.YAxisMinimumAuto = false;
      coeffTable.VisualProperties.XAxisMaximumAuto = false;
      coeffTable.VisualProperties.XAxisMinimumAuto = false;

      coeffTable.VisualProperties.XAxisLogScale = true;
      coeffTable.VisualProperties.XAxisTitle = "Log10(Lambda)";
      coeffTable.VisualProperties.YAxisTitle = "Coefficients";
      coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";

      var nLambdas = lambda.Length;
      var nCoeff = coeff.GetLength(1);
      var dataRows = new IndexedDataRow<double>[nCoeff];
      var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
      var numNonZeroCoeffs = new int[nLambdas];
      for (int i = 0; i < nCoeff; i++) {
        var coeffId = allowedVars[i];
        // scale by sigma to show standardized coefficient values in the chart
        double sigma = Problem.ProblemData.Dataset.GetDoubleValues(coeffId).StandardDeviation();
        var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
        dataRows[i] = new IndexedDataRow<double>(coeffId, coeffId, path);
        coeffTable.Rows.Add(dataRows[i]);
      }

      // count the non-zero coefficients for each step along the path
      for (int i = 0; i < coeff.GetLength(0); i++) {
        for (int j = 0; j < coeff.GetLength(1); j++) {
          if (!coeff[i, j].IsAlmost(0.0)) {
            numNonZeroCoeffs[i]++;
          }
        }
      }
      if (lambda.Length > 2) {
        // axis bounds from the smallest and the second-largest lambda
        // (the first lambda value of the path is skipped)
        coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
        coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
      }
      coeffTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
      coeffTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
      coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;

      Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));

      var errorTable = new IndexedDataTable<double>("NMSE", "Path of NMSE values over different lambda values");
      errorTable.VisualProperties.YAxisMaximumAuto = false;
      errorTable.VisualProperties.YAxisMinimumAuto = false;
      errorTable.VisualProperties.XAxisMaximumAuto = false;
      errorTable.VisualProperties.XAxisMinimumAuto = false;

      // NMSE is normalized => fixed range [0, 1] on the y-axis
      errorTable.VisualProperties.YAxisMinimumFixedValue = 0;
      errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0;
      errorTable.VisualProperties.XAxisLogScale = true;
      errorTable.VisualProperties.XAxisTitle = "Log10(Lambda)";
      errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)";
      errorTable.VisualProperties.SecondYAxisTitle = "Number of variables";
      errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (train)", "Path of NMSE values over different lambda values", lambda.Zip(trainNMSE, (l, v) => Tuple.Create(l, v))));
      errorTable.Rows.Add(new IndexedDataRow<double>("NMSE (test)", "Path of NMSE values over different lambda values", lambda.Zip(testNMSE, (l, v) => Tuple.Create(l, v))));
      errorTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
      if (lambda.Length > 2) {
        errorTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
        errorTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
      }
      errorTable.Rows["NMSE (train)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
      errorTable.Rows["NMSE (test)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
      errorTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
      errorTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;

      Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable));
    }

    /// <summary>
    /// Calculates one elastic-net solution for a single, fixed lambda value.
    /// </summary>
    /// <returns>Coefficient vector; the last element is the intercept.</returns>
    public static double[] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double lambda,
            out double trainNMSE, out double testNMSE,
            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
      double[] trainNMSEs;
      double[] testNMSEs;
      // run for exactly one lambda
      var coeffs = CreateElasticNetLinearRegressionSolution(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound);
      trainNMSE = trainNMSEs[0];
      testNMSE = testNMSEs[0];
      return coeffs[0];
    }

    /// <summary>
    /// Calculates elastic-net solutions for multiple user-supplied lambda values.
    /// </summary>
    /// <returns>One coefficient vector per lambda; the last element of each vector is the intercept.</returns>
    public static double[][] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double[] lambda,
            out double[] trainNMSEs, out double[] testNMSEs,
            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
            int maxVars = -1) {
      // run for multiple user-supplied lambdas
      double[,] coeff;
      double[] intercept;
      RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);

      // append the intercept to each coefficient vector
      int nRows = intercept.Length;
      int nCols = coeff.GetLength(1) + 1;
      double[][] sols = new double[nRows][];
      for (int solIdx = 0; solIdx < nRows; solIdx++) {
        sols[solIdx] = new double[nCols];
        for (int cIdx = 0; cIdx < nCols - 1; cIdx++) {
          sols[solIdx][cIdx] = coeff[solIdx, cIdx];
        }
        sols[solIdx][nCols - 1] = intercept[solIdx];
      }
      return sols;
    }

    /// <summary>
    /// Runs the full elastic-net path with automatically determined lambda values
    /// (at most 100 different lambdas).
    /// </summary>
    public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
      out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
      int maxVars = -1
      ) {
      double[] userLambda = new double[0];
      // automatically determine lambda values (maximum 100 different lambda values)
      RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
    }

    /// <summary>
    /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
    /// </summary>
    /// <param name="problemData">Predictor target matrix x and target vector y</param>
    /// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
    /// <param name="nlam">Maximum number of lambda values (default 100)</param>
    /// <param name="flmin">User control of lambda values (&lt;1.0 =&gt; minimum lambda = flmin * (largest lambda value), &gt;= 1.0 =&gt; use supplied lambda values)</param>
    /// <param name="ulam">User supplied lambda values</param>
    /// <param name="lambda">Output lambda values</param>
    /// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
    /// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
    /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
    /// <param name="intercept">Vector of intercepts for each solution along the path</param>
    /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
    /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
    /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
    private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
      int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
      int maxVars = -1
      ) {
      if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty");

      double[,] trainX;
      double[,] testX;
      double[] trainY;
      double[] testY;
      int numTrainObs, numTestObs;
      int numVars;
      PrepareData(problemData, out trainX, out trainY, out numTrainObs, out testX, out testY, out numTestObs, out numVars);

      int ka = 1; // => covariance updating algorithm
      double parm = penalty;
      double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
      int[] jd = new int[1]; // jd[0] = 0 => no variables are excluded from the model
      // NOTE(review): vp are the per-variable relative penalty factors; 1.0 applies
      // the standard penalty to every variable (0.0 would mean unpenalized).
      double[] vp = Enumerable.Repeat(1.0, numVars).ToArray();
      double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
      for (int i = 0; i < numVars; i++) {
        cl[i, 0] = coeffLowerBound;
        cl[i, 1] = coeffUpperBound;
      }

      int ne = maxVars > 0 ? maxVars : numVars;
      int nx = numVars;
      double thr = 1.0e-5; // default value as recommended in glmnet
      int isd = 1; //  => regression on standardized predictor variables
      int intr = 1; //  => do include intercept in model
      int maxit = 100000; // default value as recommended in glmnet

      // outputs
      int lmu = -1;
      double[,] ca;
      int[] ia;
      int[] nin;
      int nlp = -99;
      int jerr = -99;
      double[] trainR2;
      Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);

      trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE
      testNMSE = new double[lmu];
      coeff = new double[lmu, numVars];
      for (int solIdx = 0; solIdx < lmu; solIdx++) {
        trainNMSE[solIdx] = 1.0 - trainR2[solIdx];

        // uncompress coefficients of solution
        int selectedNin = nin[solIdx];
        double[] coefficients;
        double[] selectedCa = new double[nx];
        for (int i = 0; i < nx; i++) {
          selectedCa[i] = ca[solIdx, i];
        }

        // apply to test set to calculate test NMSE values for each lambda step
        double[] fn;
        Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
        OnlineCalculatorError error;
        var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
        if (error != OnlineCalculatorError.None) nmse = double.MaxValue;
        testNMSE[solIdx] = nmse;

        // uncompress coefficients
        Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
        for (int i = 0; i < coefficients.Length; i++) {
          coeff[solIdx, i] = coefficients[i];
        }
      }
    }

    /// <summary>
    /// Extracts training and test partitions from the problem data into dense
    /// arrays for glmnet. The predictor matrices are laid out as
    /// [variable, observation] (column-major from glmnet's perspective —
    /// presumably matching the native wrapper's expectation; verify against Glmnet.elnet).
    /// </summary>
    private static void PrepareData(IRegressionProblemData problemData,
      out double[,] trainX, out double[] trainY, out int numTrainObs,
      out double[,] testX, out double[] testY, out int numTestObs,
      out int numVars) {
      numVars = problemData.AllowedInputVariables.Count();
      numTrainObs = problemData.TrainingIndices.Count();
      numTestObs = problemData.TestIndices.Count();

      trainX = new double[numVars, numTrainObs];
      trainY = new double[numTrainObs];
      testX = new double[numVars, numTestObs];
      testY = new double[numTestObs];
      var ds = problemData.Dataset;
      var targetVar = problemData.TargetVariable;
      // train
      int rIdx = 0;
      foreach (var row in problemData.TrainingIndices) {
        int cIdx = 0;
        foreach (var var in problemData.AllowedInputVariables) {
          trainX[cIdx, rIdx] = ds.GetDoubleValue(var, row);
          cIdx++;
        }
        trainY[rIdx] = ds.GetDoubleValue(targetVar, row);
        rIdx++;
      }
      // test
      rIdx = 0;
      foreach (var row in problemData.TestIndices) {
        int cIdx = 0;
        foreach (var var in problemData.AllowedInputVariables) {
          testX[cIdx, rIdx] = ds.GetDoubleValue(var, row);
          cIdx++;
        }
        testY[rIdx] = ds.GetDoubleValue(targetVar, row);
        rIdx++;
      }
    }
  }
}