#region License Information /* HeuristicLab * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Threading; using HeuristicLab.Analysis; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.Optimization; using HeuristicLab.Parameters; using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; using HeuristicLab.Problems.DataAnalysis; using HeuristicLab.Problems.DataAnalysis.Symbolic; using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression; namespace HeuristicLab.Algorithms.DataAnalysis.Glmnet { [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")] [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)] [StorableClass] public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm { private const string PenalityParameterName = "Penality"; private const string LambdaParameterName = "Lambda"; #region parameters public IFixedValueParameter PenalityParameter { get { return (IFixedValueParameter)Parameters[PenalityParameterName]; } } public IValueParameter LambdaParameter { get { return (IValueParameter)Parameters[LambdaParameterName]; } } #endregion #region properties public double Penality { get { return PenalityParameter.Value.Value; } set { PenalityParameter.Value.Value = value; } } public DoubleValue Lambda { get { return LambdaParameter.Value; } set { LambdaParameter.Value = value; } } #endregion [StorableConstructor] private ElasticNetLinearRegression(bool deserializing) : base(deserializing) { } private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner) : base(original, cloner) { } public ElasticNetLinearRegression() : base() { Problem = new RegressionProblem(); Parameters.Add(new FixedValueParameter(PenalityParameterName, "Penalty factor (alpha) for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5))); Parameters.Add(new OptionalValueParameter(LambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas")); } [StorableHook(HookType.AfterDeserialization)] private void AfterDeserialization() { } public override IDeepCloneable Clone(Cloner cloner) { return new ElasticNetLinearRegression(this, cloner); } protected override void Run(CancellationToken cancellationToken) { if (Lambda == null) { CreateSolutionPath(); } else { CreateSolution(Lambda.Value); } } private void CreateSolution(double lambda) { double trainNMSE; double testNMSE; var coeff = CalculateModelCoefficients(Problem.ProblemData, Penality, lambda, out trainNMSE, out testNMSE); Results.Add(new Result("NMSE (train)", new DoubleValue(trainNMSE))); Results.Add(new Result("NMSE (test)", new DoubleValue(testNMSE))); var solution = CreateSymbolicSolution(coeff, Problem.ProblemData); Results.Add(new Result(solution.Name, solution.Description, solution)); } public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) { var ds = problemData.Dataset; var allVariables = problemData.AllowedInputVariables.ToArray(); var doubleVariables = allVariables.Where(ds.VariableHasType); var factorVariableNames = allVariables.Where(ds.VariableHasType); var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) List>> remainingFactorVariablesAndValues = new List>>(); List factorCoeff = new List(); List remainingDoubleVariables = new List(); List doubleVarCoeff = new List(); { int i = 0; // find factor varibles & value combinations with non-zero coeff foreach (var factorVarAndValues in factorVariablesAndValues) { var l = new List(); foreach (var factorValue in factorVarAndValues.Value) { if (!coeff[i].IsAlmost(0.0)) { l.Add(factorValue); factorCoeff.Add(coeff[i]); } i++; } if (l.Any()) remainingFactorVariablesAndValues.Add(new KeyValuePair>(factorVarAndValues.Key, l)); } // find double variables with non-zero coeff foreach (var doubleVar in doubleVariables) { if (!coeff[i].IsAlmost(0.0)) { remainingDoubleVariables.Add(doubleVar); doubleVarCoeff.Add(coeff[i]); } i++; } } var tree = LinearModelToTreeConverter.CreateTree( remainingFactorVariablesAndValues, factorCoeff.ToArray(), remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(), coeff.Last()); SymbolicRegressionSolution solution = new SymbolicRegressionSolution( new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()), (IRegressionProblemData)problemData.Clone()); solution.Model.Name = "Elastic-net Linear Regression Model"; solution.Name = "Elastic-net Linear Regression Solution"; return solution; } private void CreateSolutionPath() { double[] lambda; double[] trainNMSE; double[] testNMSE; double[,] coeff; double[] intercept; RunElasticNetLinearRegression(Problem.ProblemData, Penality, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept); var coeffTable = new IndexedDataTable("Coefficients", "The paths of standarized coefficient values over different lambda values"); coeffTable.VisualProperties.YAxisMaximumAuto = false; coeffTable.VisualProperties.YAxisMinimumAuto = false; coeffTable.VisualProperties.XAxisMaximumAuto = false; coeffTable.VisualProperties.XAxisMinimumAuto = false; coeffTable.VisualProperties.XAxisLogScale = true; coeffTable.VisualProperties.XAxisTitle = "Lambda"; coeffTable.VisualProperties.YAxisTitle = "Coefficients"; coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables"; var nLambdas = lambda.Length; var nCoeff = coeff.GetLength(1); var dataRows = new IndexedDataRow[nCoeff]; var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray(); var numNonZeroCoeffs = new int[nLambdas]; var ds = Problem.ProblemData.Dataset; var doubleVariables = allowedVars.Where(ds.VariableHasType); var factorVariableNames = allowedVars.Where(ds.VariableHasType); var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) { int i = 0; foreach (var factorVariableAndValues in factorVariablesAndValues) { foreach (var factorValue in factorVariableAndValues.Value) { double sigma = ds.GetStringValues(factorVariableAndValues.Key) .Select(s => s == factorValue ? 1.0 : 0.0) .StandardDeviation(); // calc std dev of binary indicator var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray(); dataRows[i] = new IndexedDataRow(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path); i++; } } foreach (var doubleVariable in doubleVariables) { double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation(); var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray(); dataRows[i] = new IndexedDataRow(doubleVariable, doubleVariable, path); i++; } // add to coeffTable by total weight (larger area under the curve => more important); foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x)))) { coeffTable.Rows.Add(r); } } for (int i = 0; i < coeff.GetLength(0); i++) { for (int j = 0; j < coeff.GetLength(1); j++) { if (!coeff[i, j].IsAlmost(0.0)) { numNonZeroCoeffs[i]++; } } } if (lambda.Length > 2) { coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last()))); coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First()))); } coeffTable.Rows.Add(new IndexedDataRow("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v)))); coeffTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points; coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true; Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable)); var errorTable = new IndexedDataTable("NMSE", "Path of NMSE values over different lambda values"); errorTable.VisualProperties.YAxisMaximumAuto = false; errorTable.VisualProperties.YAxisMinimumAuto = false; errorTable.VisualProperties.XAxisMaximumAuto = false; errorTable.VisualProperties.XAxisMinimumAuto = false; errorTable.VisualProperties.YAxisMinimumFixedValue = 0; errorTable.VisualProperties.YAxisMaximumFixedValue = 1.0; errorTable.VisualProperties.XAxisLogScale = true; errorTable.VisualProperties.XAxisTitle = "Lambda"; errorTable.VisualProperties.YAxisTitle = "Normalized mean of squared errors (NMSE)"; errorTable.VisualProperties.SecondYAxisTitle = "Number of variables"; errorTable.Rows.Add(new IndexedDataRow("NMSE (train)", "Path of NMSE values over different lambda values", lambda.Zip(trainNMSE, (l, v) => Tuple.Create(l, v)))); errorTable.Rows.Add(new IndexedDataRow("NMSE (test)", "Path of NMSE values over different lambda values", lambda.Zip(testNMSE, (l, v) => Tuple.Create(l, v)))); errorTable.Rows.Add(new IndexedDataRow("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v)))); if (lambda.Length > 2) { errorTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last()))); errorTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First()))); } errorTable.Rows["NMSE (train)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points; errorTable.Rows["NMSE (test)"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points; errorTable.Rows["Number of variables"].VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points; errorTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true; Results.Add(new Result(errorTable.Name, errorTable.Description, errorTable)); } public static double[] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double lambda, out double trainNMSE, out double testNMSE, double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) { double[] trainNMSEs; double[] testNMSEs; // run for exactly one lambda var coeffs = CalculateModelCoefficients(problemData, penalty, new double[] { lambda }, out trainNMSEs, out testNMSEs, coeffLowerBound, coeffUpperBound); trainNMSE = trainNMSEs[0]; testNMSE = testNMSEs[0]; return coeffs[0]; } public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda, out double[] trainNMSEs, out double[] testNMSEs, double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity, int maxVars = -1) { // run for multiple user-supplied lambdas double[,] coeff; double[] intercept; RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars); int nRows = intercept.Length; int nCols = coeff.GetLength(1) + 1; double[][] sols = new double[nRows][]; for (int solIdx = 0; solIdx < nRows; solIdx++) { sols[solIdx] = new double[nCols]; for (int cIdx = 0; cIdx < nCols - 1; cIdx++) { sols[solIdx][cIdx] = coeff[solIdx, cIdx]; } sols[solIdx][nCols - 1] = intercept[solIdx]; } return sols; } public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept, double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity, int maxVars = -1 ) { double[] userLambda = new double[0]; // automatically determine lambda values (maximum 100 different lambda values) RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainNMSE, out testNMSE, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars); } /// /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas /// /// Predictor target matrix x and target vector y /// Penalty for balance between ridge (0.0) and lasso (1.0) regression /// Maximum number of lambda values (default 100) /// User control of lambda values (<1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values /// User supplied lambda values /// Output lambda values /// Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path /// Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path /// Vector of coefficient vectors for each solution along the path /// Vector of intercepts for each solution along the path /// Optional lower bound for all coefficients /// Optional upper bound for all coefficients /// Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed) private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty, int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept, double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity, int maxVars = -1 ) { if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty"); double[,] trainX; double[,] testX; double[] trainY; double[] testY; PrepareData(problemData, out trainX, out trainY, out testX, out testY); var numTrainObs = trainX.GetLength(1); var numTestObs = testX.GetLength(1); var numVars = trainX.GetLength(0); int ka = 1; // => covariance updating algorithm double parm = penalty; double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight int[] jd = new int[1]; // do not force to use any of the variables double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // all predictor variables are unpenalized double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients for (int i = 0; i < numVars; i++) { cl[i, 0] = coeffLowerBound; cl[i, 1] = coeffUpperBound; } int ne = maxVars > 0 ? maxVars : numVars; int nx = numVars; double thr = 1.0e-5; // default value as recommended in glmnet int isd = 1; // => regression on standardized predictor variables int intr = 1; // => do include intercept in model int maxit = 100000; // default value as recommended in glmnet // outputs int lmu = -1; double[,] ca; int[] ia; int[] nin; int nlp = -99; int jerr = -99; double[] trainR2; Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr); trainNMSE = new double[lmu]; // elnet returns R**2 as 1 - NMSE testNMSE = new double[lmu]; coeff = new double[lmu, numVars]; for (int solIdx = 0; solIdx < lmu; solIdx++) { trainNMSE[solIdx] = 1.0 - trainR2[solIdx]; // uncompress coefficients of solution int selectedNin = nin[solIdx]; double[] coefficients; double[] selectedCa = new double[nx]; for (int i = 0; i < nx; i++) { selectedCa[i] = ca[solIdx, i]; } // apply to test set to calculate test NMSE values for each lambda step double[] fn; Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn); OnlineCalculatorError error; var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error); if (error != OnlineCalculatorError.None) nmse = double.NaN; testNMSE[solIdx] = nmse; // uncompress coefficients Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients); for (int i = 0; i < coefficients.Length; i++) { coeff[solIdx, i] = coefficients[i]; } } } private static void PrepareData(IRegressionProblemData problemData, out double[,] trainX, out double[] trainY, out double[,] testX, out double[] testY) { var ds = problemData.Dataset; var targetVariable = problemData.TargetVariable; var allowedInputs = problemData.AllowedInputVariables; trainX = PrepareInputData(ds, allowedInputs, problemData.TrainingIndices); trainY = ds.GetDoubleValues(targetVariable, problemData.TrainingIndices).ToArray(); testX = PrepareInputData(ds, allowedInputs, problemData.TestIndices); testY = ds.GetDoubleValues(targetVariable, problemData.TestIndices).ToArray(); } private static double[,] PrepareInputData(IDataset ds, IEnumerable allowedInputs, IEnumerable rows) { var doubleVariables = allowedInputs.Where(ds.VariableHasType); var factorVariableNames = allowedInputs.Where(ds.VariableHasType); var factorVariables = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set) double[,] binaryMatrix = ds.ToArray(factorVariables, rows); double[,] doubleVarMatrix = ds.ToArray(doubleVariables, rows); var x = binaryMatrix.HorzCat(doubleVarMatrix); return x.Transpose(); } } }