[15830] | 1 | #region License Information
|
---|
| 2 | /* HeuristicLab
|
---|
[17180] | 3 | * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
[15830] | 4 | *
|
---|
| 5 | * This file is part of HeuristicLab.
|
---|
| 6 | *
|
---|
| 7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
| 19 | */
|
---|
| 20 | #endregion
|
---|
| 21 |
|
---|
| 22 | using System;
|
---|
| 23 | using System.Collections.Generic;
|
---|
| 24 | using System.Linq;
|
---|
| 25 | using HeuristicLab.Common;
|
---|
| 26 | using HeuristicLab.Problems.DataAnalysis;
|
---|
[16847] | 27 | using HEAL.Attic;
|
---|
[15830] | 28 |
|
---|
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Linear regression model defined by one coefficient per input variable plus an intercept.
  /// The estimate for a row is <c>Intercept + sum(coefficient * value of the variable in that row)</c>.
  /// </summary>
  // multidimensional extension of http://www2.stat.duke.edu/~tjl13/s101/slides/unit6lec3H.pdf
  [StorableType("15F2295C-28C1-48C3-8DCB-9470823C6734")]
  internal sealed class PreconstructedLinearModel : RegressionModel {
    /// <summary>Coefficient for each input variable, keyed by variable name.</summary>
    [Storable]
    public Dictionary<string, double> Coefficients { get; private set; }

    /// <summary>Constant term added to every estimate.</summary>
    [Storable]
    public double Intercept { get; private set; }

    /// <summary>Names of the variables that have a coefficient in this model.</summary>
    public override IEnumerable<string> VariablesUsedForPrediction {
      get { return Coefficients.Keys; }
    }

    #region HLConstructors
    [StorableConstructor]
    private PreconstructedLinearModel(StorableConstructorFlag _) : base(_) { }

    // Cloning constructor: the coefficient dictionary is copied so that the clone does not share it with the original.
    private PreconstructedLinearModel(PreconstructedLinearModel original, Cloner cloner) : base(original, cloner) {
      if (original.Coefficients != null) Coefficients = new Dictionary<string, double>(original.Coefficients);
      Intercept = original.Intercept;
    }

    /// <summary>
    /// Creates a model from the given coefficients and intercept. The dictionary is copied.
    /// </summary>
    /// <param name="coefficients">Coefficient per input variable name.</param>
    /// <param name="intercept">Constant term of the model.</param>
    /// <param name="targetvariable">Name of the target variable, passed on to the base class.</param>
    public PreconstructedLinearModel(Dictionary<string, double> coefficients, double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = new Dictionary<string, double>(coefficients);
      Intercept = intercept;
    }

    /// <summary>
    /// Creates a constant model without coefficients; every estimate equals <paramref name="intercept"/>.
    /// </summary>
    /// <param name="intercept">Constant term of the model.</param>
    /// <param name="targetvariable">Name of the target variable, passed on to the base class.</param>
    public PreconstructedLinearModel(double intercept, string targetvariable) : base(targetvariable) {
      Coefficients = new Dictionary<string, double>();
      Intercept = intercept;
    }

    public override IDeepCloneable Clone(Cloner cloner) {
      return new PreconstructedLinearModel(this, cloner);
    }
    #endregion

    /// <summary>
    /// Fits a linear model to the training rows of <paramref name="pd"/>.
    /// </summary>
    /// <param name="pd">Problem data providing dataset, allowed input variables and target variable.</param>
    /// <param name="rmse">Root mean squared error of the returned model on the training rows.</param>
    /// <returns>The fitted model.</returns>
    /// <exception cref="ArgumentException">Thrown when the fallback solver also fails to produce a solution.</exception>
    public static PreconstructedLinearModel CreateLinearModel(IRegressionProblemData pd, out double rmse) {
      return AlternativeCalculation(pd, out rmse);
    }

    /// <summary>
    /// Fits a linear model with alglib's lrbuild on all rows of the dataset (pd.AllIndices).
    /// Currently not called from within this class.
    /// </summary>
    private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd) {
      int nFeatures;
      var coefficients = SolveWithLrBuild(pd, pd.AllIndices, out nFeatures);

      // Zip stops at the shorter sequence, so the trailing intercept entry is not paired with a variable.
      var coeffs = pd.AllowedInputVariables
        .Zip(coefficients, (s, d) => new { s, d })
        .ToDictionary(x => x.s, x => x.d);
      return new PreconstructedLinearModel(coeffs, coefficients[nFeatures], pd.TargetVariable);
    }

    /// <summary>
    /// Fits a linear model on the training rows by solving the normal equations
    /// (X^T X) b = X^T y via a Cholesky decomposition. If the decomposition or the solve fails,
    /// alglib's lrbuild is used instead, on the same training rows.
    /// </summary>
    /// <param name="pd">Problem data providing dataset, allowed input variables and target variable.</param>
    /// <param name="rmse">Root mean squared error of the returned model on the training rows.</param>
    /// <exception cref="ArgumentException">Thrown when the fallback solver also fails to produce a solution.</exception>
    private static PreconstructedLinearModel AlternativeCalculation(IRegressionProblemData pd, out double rmse) {
      var variables = pd.AllowedInputVariables.ToList();
      // Materialise once; the rows are needed for every variable, for the fallback and for the RMSE.
      var trainingRows = pd.TrainingIndices.ToArray();
      var n = variables.Count;
      var m = trainingRows.Length;

      // Set up X^T; the additional last row of ones yields the intercept.
      var inTr = new double[n + 1, m];
      for (var i = 0; i < n; i++) {
        var vdata = pd.Dataset.GetDoubleValues(variables[i], trainingRows).ToArray();
        for (var j = 0; j < m; j++) inTr[i, j] = vdata[j];
      }
      for (var j = 0; j < m; j++) inTr[n, j] = 1;

      // Set up y as a column matrix.
      var y = new double[m, 1];
      var ydata = pd.TargetVariableTrainingValues.ToArray();
      for (var i = 0; i < m; i++) y[i, 0] = ydata[i];

      // Normal equations (beta = 0 in both products, so the previous contents of the outputs are ignored).
      var aTy = new double[n + 1, 1];
      var aTa = new double[n + 1, n + 1];
      alglib.rmatrixgemm(n + 1, 1, m, 1, inTr, 0, 0, 0, y, 0, 0, 0, 0, ref aTy, 0, 0); // aTy = inTr * y
      alglib.rmatrixgemm(n + 1, n + 1, m, 1, inTr, 0, 0, 0, inTr, 0, 0, 1, 0, ref aTa, 0, 0); // aTa = inTr * t(inTr)

      double[] coefficients = null;
      var solved = false;
      // The factorisation overwrites aTa; only solve when it reports success.
      if (alglib.spdmatrixcholesky(ref aTa, n + 1, true)) {
        var aTyVector = new double[n + 1];
        for (var i = 0; i < n + 1; i++) aTyVector[i] = aTy[i, 0];

        int info;
        alglib.densesolverreport report;
        alglib.spdmatrixcholeskysolve(aTa, n + 1, true, aTyVector, out info, out report, out coefficients);
        solved = info == 1;
      }

      // If the Cholesky calculation fails, fall back to classic linear regression on the same training rows.
      var nFeatures = n;
      if (!solved) coefficients = SolveWithLrBuild(pd, trainingRows, out nFeatures);

      // coefficients holds one entry per variable followed by the intercept.
      var coeffs = Enumerable.Range(0, nFeatures).ToDictionary(i => variables[i], i => coefficients[i]);
      var model = new PreconstructedLinearModel(coeffs, coefficients[nFeatures], pd.TargetVariable);

      var sumOfSquaredResiduals = trainingRows
        .Select(row => pd.Dataset.GetDoubleValue(pd.TargetVariable, row) - model.GetEstimatedValue(pd.Dataset, row))
        .Sum(r => r * r);
      rmse = Math.Sqrt(sumOfSquaredResiduals / m);
      return model;
    }

    /// <summary>
    /// Fits a linear model with alglib's lrbuild on the given rows and unpacks its coefficients.
    /// </summary>
    /// <param name="pd">Problem data providing dataset, allowed input variables and target variable.</param>
    /// <param name="rows">Rows of the dataset to fit on.</param>
    /// <param name="nFeatures">Number of variables reported by lrunpack; also the index of the intercept in the result.</param>
    /// <returns>Coefficients as unpacked by lrunpack.</returns>
    /// <exception cref="ArgumentException">Thrown when lrbuild does not report success.</exception>
    private static double[] SolveWithLrBuild(IRegressionProblemData pd, IEnumerable<int> rows, out int nFeatures) {
      // Input variables first, target variable as the last column.
      var columns = pd.AllowedInputVariables.Concat(new[] { pd.TargetVariable });
      var inputMatrix = pd.Dataset.ToArray(columns, rows);
      nFeatures = inputMatrix.GetLength(1) - 1;

      alglib.linearmodel lm;
      alglib.lrreport ar;
      int retVal;
      alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");

      double[] coefficients;
      alglib.lrunpack(lm, out coefficients, out nFeatures);
      return coefficients;
    }

    /// <summary>Lazily computes the model estimate for each of the given rows.</summary>
    public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
      return rows.Select(row => GetEstimatedValue(dataset, row));
    }

    public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
      return new RegressionSolution(this, problemData);
    }

    #region helpers
    // Estimate for a single row; Sum over an empty coefficient set is 0, so a constant model returns Intercept.
    private double GetEstimatedValue(IDataset dataset, int row) {
      return Intercept + Coefficients.Sum(s => s.Value * dataset.GetDoubleValue(s.Key, row));
    }
    #endregion
  }
}
---|