using System;
using System.Linq;
using System.Runtime.InteropServices;
using HeuristicLab.Algorithms.DataAnalysis;
using HeuristicLab.Analysis;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;
using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;

namespace HeuristicLab.LibGlmNet {
  [Item("Elastic-net Linear Regression (LR)", "Linear regression with elastic-net regularization (wrapper for glmnet)")]
  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 110)]
  [StorableClass]
  public sealed class ElasticNetLinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
    private const string PenaltyParameterName = "Penalty";
    private const string LogLambdaParameterName = "Log10(Lambda)";

    #region parameters
    public IFixedValueParameter<DoubleValue> PenaltyParameter {
      get { return (IFixedValueParameter<DoubleValue>)Parameters[PenaltyParameterName]; }
    }
    public IValueParameter<DoubleValue> LogLambdaParameter {
      get { return (IValueParameter<DoubleValue>)Parameters[LogLambdaParameterName]; }
    }
    #endregion

    #region properties
    public double Penalty {
      get { return PenaltyParameter.Value.Value; }
      set { PenaltyParameter.Value.Value = value; }
    }
    public DoubleValue LogLambda {
      get { return LogLambdaParameter.Value; }
      set { LogLambdaParameter.Value = value; }
    }
    #endregion

    [StorableConstructor]
    private ElasticNetLinearRegression(bool deserializing) : base(deserializing) { }
    private ElasticNetLinearRegression(ElasticNetLinearRegression original, Cloner cloner) : base(original, cloner) { }
    public ElasticNetLinearRegression() : base() {
      Problem = new RegressionProblem();
      Parameters.Add(new FixedValueParameter<DoubleValue>(PenaltyParameterName, "Penalty factor for balancing between ridge (0.0) and lasso (1.0) regression", new DoubleValue(0.5)));
      Parameters.Add(new OptionalValueParameter<DoubleValue>(LogLambdaParameterName, "Optional: the value of lambda for which to calculate an elastic-net solution. lambda == null => calculate the whole path of all lambdas"));
    }
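    // Note: Penalty = 0.0 corresponds to pure ridge regression (L2 penalty only),
    // Penalty = 1.0 to pure lasso (L1 penalty only); intermediate values blend both.
    // A minimal usage sketch (assuming an existing RegressionProblem `problem` that
    // already holds the data; names outside this file are illustrative only):
    //
    //   var enlr = new ElasticNetLinearRegression();
    //   enlr.Problem = problem;
    //   enlr.Penalty = 0.5;                    // equal ridge/lasso mix
    //   enlr.LogLambda = new DoubleValue(-2);  // single solution at lambda = 10^-2
    //   // leaving LogLambda == null instead computes the whole lambda path
    //   enlr.Start();                          // executes Run(); see enlr.Results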
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() { }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new ElasticNetLinearRegression(this, cloner);
    }

    protected override void Run() {
      if (LogLambda == null) {
        CreateSolutionPath();
      } else {
        CreateSolution(LogLambda.Value);
      }
    }

    private void CreateSolution(double logLambda) {
      double trainRsq;
      double testRsq;
      var coeff = CreateElasticNetLinearRegressionSolution(Problem.ProblemData, Penalty, Math.Pow(10, logLambda), out trainRsq, out testRsq);
      Results.Add(new Result("R² (train)", new DoubleValue(trainRsq)));
      Results.Add(new Result("R² (test)", new DoubleValue(testRsq)));

      // copied from LR => TODO: reuse code (but skip coefficients = 0.0)
      ISymbolicExpressionTree tree = new SymbolicExpressionTree(new ProgramRootSymbol().CreateTreeNode());
      ISymbolicExpressionTreeNode startNode = new StartSymbol().CreateTreeNode();
      tree.Root.AddSubtree(startNode);
      ISymbolicExpressionTreeNode addition = new Addition().CreateTreeNode();
      startNode.AddSubtree(addition);

      int col = 0;
      foreach (string column in Problem.ProblemData.AllowedInputVariables) {
        if (!coeff[col].IsAlmost(0.0)) {
          VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
          vNode.VariableName = column;
          vNode.Weight = coeff[col];
          addition.AddSubtree(vNode);
        }
        col++;
      }

      if (!coeff[coeff.Length - 1].IsAlmost(0.0)) {
        ConstantTreeNode cNode = (ConstantTreeNode)new Constant().CreateTreeNode();
        cNode.Value = coeff[coeff.Length - 1];
        addition.AddSubtree(cNode);
      }

      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
        new SymbolicRegressionModel(Problem.ProblemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
        (IRegressionProblemData)Problem.ProblemData.Clone());
      solution.Model.Name = "Elastic-net Linear Regression Model";
      solution.Name = "Elastic-net Linear Regression Solution";

      Results.Add(new Result(solution.Name, solution.Description, solution));
    }
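    // The symbolic model built above encodes y = w1*x1 + ... + wk*xk + c as an
    // addition node over weighted variable nodes plus a constant node; variables
    // whose coefficient is (almost) zero are skipped, so lasso-leaning solutions
    // (Penalty near 1.0) yield compact models.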
    private void CreateSolutionPath() {
      double[] lambda;
      double[] trainRsq;
      double[] testRsq;
      double[,] coeff;
      double[] intercept;
      RunElasticNetLinearRegression(Problem.ProblemData, Penalty, out lambda, out trainRsq, out testRsq, out coeff, out intercept);

      var coeffTable = new DataTable("Coefficient Paths", "The paths of standardized coefficient values over different lambda values");
      var nLambdas = lambda.Length;
      var nCoeff = coeff.GetLength(1);
      var dataRows = new DataRow[nCoeff];
      var allowedVars = Problem.ProblemData.AllowedInputVariables.ToArray();
      for (int i = 0; i < nCoeff; i++) {
        var coeffId = allowedVars[i];
        // scale each coefficient path by the standard deviation of its variable
        double sigma = Problem.ProblemData.Dataset.GetDoubleValues(coeffId).StandardDeviation();
        var path = Enumerable.Range(0, nLambdas).Select(r => coeff[r, i] * sigma).ToArray();
        dataRows[i] = new DataRow(coeffId, coeffId, path);
        coeffTable.Rows.Add(dataRows[i]);
      }
      Results.Add(new Result(coeffTable.Name, coeffTable.Description, coeffTable));

      var rsqPlot = new ScatterPlot("R-Squared", "Path of R² values over different lambda values");
      rsqPlot.VisualProperties.YAxisMaximumAuto = false;
      rsqPlot.VisualProperties.YAxisMinimumAuto = false;
      rsqPlot.VisualProperties.XAxisMaximumAuto = false;
      rsqPlot.VisualProperties.XAxisMinimumAuto = false;

      rsqPlot.VisualProperties.YAxisMinimumFixedValue = 0;
      rsqPlot.VisualProperties.YAxisMaximumFixedValue = 1.0;
      rsqPlot.VisualProperties.XAxisTitle = "Log10(Lambda)";
      rsqPlot.VisualProperties.YAxisTitle = "R²";
      rsqPlot.Rows.Add(new ScatterPlotDataRow("R² (train)", "Path of R² values over different lambda values", lambda.Zip(trainRsq, (l, r) => new Point2D<double>(Math.Log10(l), r))));
      rsqPlot.Rows.Add(new ScatterPlotDataRow("R² (test)", "Path of R² values over different lambda values", lambda.Zip(testRsq, (l, r) => new Point2D<double>(Math.Log10(l), r))));
      if (lambda.Length > 2) {
        // lambda values are produced in decreasing order => lambda.Last() is the smallest;
        // the first lambda is skipped for the axis maximum because it is typically much
        // larger than the remaining values
        rsqPlot.VisualProperties.XAxisMinimumFixedValue = Math.Floor(Math.Log10(lambda.Last()));
        rsqPlot.VisualProperties.XAxisMaximumFixedValue = Math.Ceiling(Math.Log10(lambda.Skip(1).First()));
      }
      rsqPlot.Rows["R² (train)"].VisualProperties.PointSize = 5;
      rsqPlot.Rows["R² (test)"].VisualProperties.PointSize = 5;

      Results.Add(new Result(rsqPlot.Name, rsqPlot.Description, rsqPlot));
    }

    public static double[] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double lambda,
      out double trainRsq, out double testRsq,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity) {
      double[] trainRsqs;
      double[] testRsqs;
      // run for exactly one lambda
      var coeffs = CreateElasticNetLinearRegressionSolution(problemData, penalty, new double[] { lambda }, out trainRsqs, out testRsqs, coeffLowerBound, coeffUpperBound);
      trainRsq = trainRsqs[0];
      testRsq = testRsqs[0];
      return coeffs[0];
    }

    public static double[][] CreateElasticNetLinearRegressionSolution(IRegressionProblemData problemData, double penalty, double[] lambda,
      out double[] trainRsq, out double[] testRsq,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
      int maxVars = -1) {
      // run for multiple user-supplied lambdas
      double[,] coeff;
      double[] intercept;
      RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainRsq, out testRsq, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);

      int nRows = intercept.Length;
      int nCols = coeff.GetLength(1) + 1;
      double[][] sols = new double[nRows][];
      for (int solIdx = 0; solIdx < nRows; solIdx++) {
        sols[solIdx] = new double[nCols];
        for (int cIdx = 0; cIdx < nCols - 1; cIdx++) {
          sols[solIdx][cIdx] = coeff[solIdx, cIdx];
        }
        sols[solIdx][nCols - 1] = intercept[solIdx]; // the last element of each solution is the intercept
      }
      return sols;
    }

    public static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
      out double[] lambda, out double[] trainRsq, out double[] testRsq, out double[,] coeff, out double[] intercept,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
      int maxVars = -1) {
      double[] userLambda = new double[0];
      // automatically determine lambda values (maximum 100 different lambda values)
      RunElasticNetLinearRegression(problemData, penalty, 100, 0.0, userLambda, out lambda, out trainRsq, out testRsq, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);
    }
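    // Usage sketch for the static API above (a minimal example; `problemData` stands
    // for any IRegressionProblemData instance and is not defined in this file):
    //
    //   double trainRsq, testRsq;
    //   double[] coeff = CreateElasticNetLinearRegressionSolution(problemData,
    //     penalty: 0.5, lambda: 0.1, out trainRsq, out testRsq);
    //   // coeff[0 .. n-2] are the weights of the allowed input variables (in order),
    //   // coeff[n-1] is the intercept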
    /// <summary>
    /// Elastic net with squared-error loss for a dense predictor matrix; runs the full path of all lambdas.
    /// </summary>
    /// <param name="problemData">Provides the predictor matrix x and the target vector y</param>
    /// <param name="penalty">Penalty for balancing between ridge (0.0) and lasso (1.0) regression</param>
    /// <param name="nlam">Maximum number of lambda values (default 100)</param>
    /// <param name="flmin">User control of lambda values (&lt; 1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values)</param>
    /// <param name="ulam">User-supplied lambda values</param>
    /// <param name="lambda">Output lambda values</param>
    /// <param name="trainRsq">Vector of R² values on the training set for each set of coefficients along the path</param>
    /// <param name="testRsq">Vector of R² values on the test set for each set of coefficients along the path</param>
    /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
    /// <param name="intercept">Vector of intercepts for each solution along the path</param>
    /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
    /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
    /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
    private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
      int nlam, double flmin, double[] ulam,
      out double[] lambda, out double[] trainRsq, out double[] testRsq, out double[,] coeff, out double[] intercept,
      double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
      int maxVars = -1) {
      if (penalty < 0.0 || penalty > 1.0) throw new ArgumentException("0 <= penalty <= 1", "penalty");

      double[,] trainX;
      double[,] testX;
      double[] trainY;
      double[] testY;
      int numTrainObs, numTestObs;
      int numVars;
      PrepareData(problemData, out trainX, out trainY, out numTrainObs, out testX, out testY, out numTestObs, out numVars);

      int ka = 1; // => covariance updating algorithm
      double parm = penalty;
      double[] w = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
      int[] jd = new int[1]; // jd(1) = 0 => use all variables (none are excluded)
      double[] vp = Enumerable.Repeat(1.0, numVars).ToArray(); // all predictor variables are penalized equally (vp(j) = 0 would leave variable j unpenalized)
      double[,] cl = new double[numVars, 2]; // use the same bounds for all coefficients
      for (int i = 0; i < numVars; i++) {
        cl[i, 0] = coeffLowerBound;
        cl[i, 1] = coeffUpperBound;
      }
      int ne = maxVars > 0 ? maxVars : numVars;
      int nx = numVars;
      double thr = 1.0e-5; // default value as recommended in glmnet
      int isd = 1; // => regression on standardized predictor variables
      int intr = 1; // => do include intercept in model
      int maxit = 100000; // default value as recommended in glmnet
      // outputs
      int lmu = -1;
      double[,] ca;
      int[] ia;
      int[] nin;
      int nlp = -99;
      int jerr = -99;

      elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainRsq, out lambda, out nlp, out jerr);

      testRsq = new double[lmu];
      coeff = new double[lmu, numVars];
      for (int solIdx = 0; solIdx < lmu; solIdx++) {
        // extract the compressed coefficients of this solution
        int selectedNin = nin[solIdx];
        double[] coefficients;
        double[] selectedCa = new double[nx];
        for (int i = 0; i < nx; i++) {
          selectedCa[i] = ca[solIdx, i];
        }

        // apply to the test set to calculate test R² values for each lambda step
        double[] fn;
        modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
        OnlineCalculatorError error;
        var r = OnlinePearsonsRCalculator.Calculate(testY, fn, out error);
        if (error != OnlineCalculatorError.None) r = 0;
        testRsq[solIdx] = r * r;

        // uncompress coefficients
        uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
        for (int i = 0; i < coefficients.Length; i++) {
          coeff[solIdx, i] = coefficients[i];
        }
      }
    }
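    // Note: PrepareData below fills trainX/testX as [variable, observation] arrays.
    // A row-major C# double[numVars, numObs] array has the same memory layout as the
    // column-major Fortran matrix x(numObs, numVars) that the native glmnet routine
    // expects (see the elnet documentation in the dllimport region), so the arrays
    // can be passed without transposing or copying.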
    private static void PrepareData(IRegressionProblemData problemData,
      out double[,] trainX, out double[] trainY, out int numTrainObs,
      out double[,] testX, out double[] testY, out int numTestObs, out int numVars) {
      numVars = problemData.AllowedInputVariables.Count();
      numTrainObs = problemData.TrainingIndices.Count();
      numTestObs = problemData.TestIndices.Count();

      trainX = new double[numVars, numTrainObs];
      trainY = new double[numTrainObs];
      testX = new double[numVars, numTestObs];
      testY = new double[numTestObs];
      var ds = problemData.Dataset;
      var targetVar = problemData.TargetVariable;
      // train
      int rIdx = 0;
      foreach (var row in problemData.TrainingIndices) {
        int cIdx = 0;
        foreach (var inputVar in problemData.AllowedInputVariables) {
          trainX[cIdx, rIdx] = ds.GetDoubleValue(inputVar, row);
          cIdx++;
        }
        trainY[rIdx] = ds.GetDoubleValue(targetVar, row);
        rIdx++;
      }
      // test
      rIdx = 0;
      foreach (var row in problemData.TestIndices) {
        int cIdx = 0;
        foreach (var inputVar in problemData.AllowedInputVariables) {
          testX[cIdx, rIdx] = ds.GetDoubleValue(inputVar, row);
          cIdx++;
        }
        testY[rIdx] = ds.GetDoubleValue(targetVar, row);
        rIdx++;
      }
    }

    #region dllimport
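    // The native glmnet routines are imported from glmnet-x86.dll / glmnet-x64.dll;
    // the matching DLL is selected at call time via Environment.Is64BitProcess and
    // must be resolvable by the native loader (e.g., placed in the application directory).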
    /// Wrapper for the elnet procedure in the glmnet library
    /// (see: https://cran.r-project.org/web/packages/glmnet/index.html)
    ///
    /// ka = algorithm flag
    ///      ka=1 => covariance updating algorithm
    ///      ka=2 => naive algorithm
    /// parm = penalty member index (0 <= parm <= 1)
    ///        = 0.0 => ridge
    ///        = 1.0 => lasso
    /// no = number of observations
    /// ni = number of predictor variables
    /// y(no) = response vector (overwritten)
    /// w(no) = observation weights (overwritten)
    /// jd(jd(1)+1) = predictor variable deletion flag
    ///      jd(1) = 0  => use all variables
    ///      jd(1) != 0 => do not use variables jd(2)...jd(jd(1)+1)
    /// vp(ni) = relative penalties for each predictor variable
    ///      vp(j) = 0 => jth variable unpenalized
    /// cl(2, ni) = interval constraints on coefficient values (overwritten)
    ///      cl(1, j) = lower bound for jth coefficient value (<= 0.0)
    ///      cl(2, j) = upper bound for jth coefficient value (>= 0.0)
    /// ne = maximum number of variables allowed to enter largest model
    ///      (stopping criterion)
    /// nx = maximum number of variables allowed to enter all models
    ///      along path (memory allocation, nx > ne).
    /// nlam = (maximum) number of lambda values
    /// flmin = user control of lambda values (>= 0)
    ///      flmin < 1.0  => minimum lambda = flmin * (largest lambda value)
    ///      flmin >= 1.0 => use supplied lambda values (see below)
    /// ulam(nlam) = user supplied lambda values (ignored if flmin < 1.0)
    /// thr = convergence threshold for each lambda solution.
    ///      iterations stop when the maximum reduction in the criterion value
    ///      as a result of each parameter update over a single pass
    ///      is less than thr times the null criterion value.
    ///      (suggested value, thr = 1.0e-5)
    /// isd = predictor variable standardization flag:
    ///      isd = 0 => regression on original predictor variables
    ///      isd = 1 => regression on standardized predictor variables
    ///      Note: output solutions always reference original
    ///            variable locations and scales.
    /// intr = intercept flag
    ///      intr = 0 / 1 => don't / do include intercept in model
    /// maxit = maximum allowed number of passes over the data for all lambda
    ///      values (suggested value, maxit = 100000)
    ///
    /// output:
    ///
    /// lmu = actual number of lambda values (solutions)
    /// a0(lmu) = intercept values for each solution
    /// ca(nx, lmu) = compressed coefficient values for each solution
    /// ia(nx) = pointers to compressed coefficients
    /// nin(lmu) = number of compressed coefficients for each solution
    /// rsq(lmu) = R**2 values for each solution
    /// alm(lmu) = lambda values corresponding to each solution
    /// nlp = actual number of passes over the data for all lambda values
    /// jerr = error flag:
    ///      jerr = 0 => no error
    ///      jerr > 0 => fatal error - no output returned
    ///           jerr < 7777 => memory allocation error
    ///           jerr = 7777 => all used predictors have zero variance
    ///           jerr = 10000 => maxval(vp) <= 0.0
    ///      jerr < 0 => non-fatal error - partial output:
    ///           solutions for larger lambdas (1:(k-1)) returned.
    ///           jerr = -k => convergence for kth lambda value not reached
    ///                after maxit (see above) iterations.
    ///           jerr = -10000 - k => number of non-zero coefficients along path
    ///                exceeds nx (see above) at kth lambda value.
    private static void elnet(
      int ka,
      double parm,
      int no,
      int ni,
      double[,] x,
      double[] y,
      double[] w,
      int[] jd,
      double[] vp,
      double[,] cl,
      int ne,
      int nx,
      int nlam,
      double flmin,
      double[] ulam,
      double thr,
      int isd,
      int intr,
      int maxit,
      // outputs
      out int lmu,
      out double[] a0,
      out double[,] ca,
      out int[] ia,
      out int[] nin,
      out double[] rsq,
      out double[] alm,
      out int nlp,
      out int jerr) {
      // initialize output values and allocate arrays that are big enough (nlam solutions at most)
      a0 = new double[nlam];
      ca = new double[nlam, nx];
      ia = new int[nx];
      nin = new int[nlam];
      rsq = new double[nlam];
      alm = new double[nlam];
      nlp = -1;
      jerr = -1;
      lmu = -1;

      // call the correct version of the native dll based on the process type (x86/x64)
      if (Environment.Is64BitProcess) {
        elnet_x64(ref ka, ref parm, ref no, ref ni, x, y, w, jd, vp, cl, ref ne, ref nx, ref nlam, ref flmin, ulam, ref thr, ref isd, ref intr, ref maxit, ref lmu, a0, ca, ia, nin, rsq, alm, ref nlp, ref jerr);
      } else {
        elnet_x86(ref ka, ref parm, ref no, ref ni, x, y, w, jd, vp, cl, ref ne, ref nx, ref nlam, ref flmin, ulam, ref thr, ref isd, ref intr, ref maxit, ref lmu, a0, ca, ia, nin, rsq, alm, ref nlp, ref jerr);
      }

      // translate the jerr error flag (documented above) into exceptions
      if (jerr != 0) {
        if (jerr > 0 && jerr < 7777) throw new InvalidOperationException("glmnet: memory allocation error");
        else if (jerr == 7777) throw new InvalidOperationException("glmnet: all used predictors have zero variance");
        else if (jerr == 10000) throw new InvalidOperationException("glmnet: maxval(vp) <= 0.0");
        else if (jerr < 0 && jerr > -10000) throw new InvalidOperationException(string.Format("glmnet: convergence for {0}th lambda value not reached after maxit iterations", -jerr));
        else if (jerr <= -10000) throw new InvalidOperationException(string.Format("glmnet: number of non-zero coefficients along path exceeds number of maximally allowed variables (nx) at {0}th lambda value", -jerr - 10000));
        else throw new InvalidOperationException(string.Format("glmnet: error {0}", jerr));
      }

      // resize arrays to the capacity that is actually necessary for the results
      Array.Resize(ref a0, lmu);
      Array.Resize(ref nin, lmu);
      Array.Resize(ref rsq, lmu);
      Array.Resize(ref alm, lmu);
    }
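    // Marshalling note: the 2D arrays above work without copying because a row-major
    // C# array double[nlam, nx] has the same memory layout as the column-major
    // Fortran matrix ca(nx, lmu); the same correspondence holds for x and cl.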
    [DllImport("glmnet-x86.dll", EntryPoint = "elnet_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void elnet_x86(
      ref int ka, ref double parm, ref int no, ref int ni, double[,] x, double[] y, double[] w, int[] jd, double[] vp, double[,] cl,
      ref int ne, ref int nx, ref int nlam, ref double flmin, double[] ulam, ref double thr, ref int isd, ref int intr, ref int maxit,
      // outputs:
      ref int lmu, [Out] double[] a0, [Out] double[,] ca, [Out] int[] ia, [Out] int[] nin, [Out] double[] rsq, [Out] double[] alm, ref int nlp, ref int jerr);
    [DllImport("glmnet-x64.dll", EntryPoint = "elnet_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void elnet_x64(
      ref int ka, ref double parm, ref int no, ref int ni, double[,] x, double[] y, double[] w, int[] jd, double[] vp, double[,] cl,
      ref int ne, ref int nx, ref int nlam, ref double flmin, double[] ulam, ref double thr, ref int isd, ref int intr, ref int maxit,
      // outputs:
      ref int lmu, [Out] double[] a0, [Out] double[,] ca, [Out] int[] ia, [Out] int[] nin, [Out] double[] rsq, [Out] double[] alm, ref int nlp, ref int jerr);

    /// Wrapper for the uncomp procedure in the glmnet library, which uncompresses
    /// the coefficient vector of a particular solution
    /// (see: https://cran.r-project.org/web/packages/glmnet/index.html)
    ///
    /// call uncomp(ni, ca, ia, nin, a)
    ///
    /// input:
    ///
    /// ni = total number of predictor variables
    /// ca(nx) = compressed coefficient values for the solution
    /// ia(nx) = pointers to compressed coefficients
    /// nin = number of compressed coefficients for the solution
    ///
    /// output:
    ///
    /// a(ni) = uncompressed coefficient vector referencing original variables
    private static void uncomp(int numVars, double[] ca, int[] ia, int nin, out double[] a) {
      a = new double[numVars];
      // call the correct version of the native dll based on the process type (x86/x64)
      if (Environment.Is64BitProcess) {
        uncomp_x64(ref numVars, ca, ia, ref nin, a);
      } else {
        uncomp_x86(ref numVars, ca, ia, ref nin, a);
      }
    }

    [DllImport("glmnet-x86.dll", EntryPoint = "uncomp_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void uncomp_x86(ref int numVars, double[] ca, int[] ia, ref int nin, double[] a);
    [DllImport("glmnet-x64.dll", EntryPoint = "uncomp_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void uncomp_x64(ref int numVars, double[] ca, int[] ia, ref int nin, double[] a);

    private static void modval(double a0, double[] ca, int[] ia, int nin, int numObs, double[,] x, out double[] fn) {
      fn = new double[numObs];
      // call the correct version of the native dll based on the process type (x86/x64)
      if (Environment.Is64BitProcess) {
        modval_x64(ref a0, ca, ia, ref nin, ref numObs, x, fn);
      } else {
        modval_x86(ref a0, ca, ia, ref nin, ref numObs, x, fn);
      }
    }
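    // Sketch of how the compressed-solution helpers combine (mirrors their use in
    // RunElasticNetLinearRegression above; k indexes one solution along the path):
    //
    //   double[] predictions;
    //   modval(a0[k], selectedCa, ia, nin[k], numObs, x, out predictions); // predict
    //   double[] dense;
    //   uncomp(numVars, selectedCa, ia, nin[k], out dense);                // expand coefficients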
    // Wrapper for the modval procedure in the glmnet library, which evaluates the
    // linear model from compressed coefficients and an uncompressed predictor matrix:
    //
    // call modval(a0, ca, ia, nin, n, x, f)
    //
    // input:
    //
    // a0 = intercept
    // ca(nx) = compressed coefficient values for a solution
    // ia(nx) = pointers to compressed coefficients
    // nin = number of compressed coefficients for solution
    // n = number of predictor vectors (observations)
    // x(n, ni) = full (uncompressed) predictor matrix
    //
    // output:
    //
    // f(n) = model predictions
    [DllImport("glmnet-x86.dll", EntryPoint = "modval_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void modval_x86(ref double a0, double[] ca, int[] ia, ref int nin, ref int numObs, double[,] x, [Out] double[] fn);
    [DllImport("glmnet-x64.dll", EntryPoint = "modval_", CallingConvention = CallingConvention.Cdecl)]
    private static extern void modval_x64(ref double a0, double[] ca, int[] ia, ref int nin, ref int numObs, double[,] x, [Out] double[] fn);
    #endregion
  }
}