1  #region License Information


/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */


20  #endregion


21 


22  using System;


23  using System.Collections.Generic;


24  using System.Linq;


25  using System.Threading;


26  using HeuristicLab.Common;


27  using HeuristicLab.Core;


28  using HeuristicLab.Data;


29  using HeuristicLab.Optimization;


30  using HEAL.Attic;


31  using HeuristicLab.Problems.DataAnalysis;


32  using HeuristicLab.Problems.DataAnalysis.Symbolic;


33  using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;


34  using HeuristicLab.Analysis;


35  using HeuristicLab.Analysis.Statistics;


36 


namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Linear regression data analysis algorithm.
  /// </summary>
  /// <remarks>
  /// The model is fitted with ALGLIB (<c>alglib.lrbuild</c>). Two solutions are produced from the
  /// same training data: a symbolic regression solution and a <see cref="LinearRegressionModel"/>
  /// based solution that carries the parameter covariance matrix.
  /// </remarks>
  [Item("Linear Regression (LR)", "Linear regression data analysis algorithm (wrapper for ALGLIB).")]
  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 100)]
  [StorableType("CF99D45EF341445E9B9E0587A8D9CBA7")]
  public sealed class LinearRegression : FixedDataAnalysisAlgorithm<IRegressionProblem> {
    private const string SolutionResultName = "Linear regression solution";
    private const string ConfidenceSolutionResultName = "Solution with prediction intervals";

    /// <summary>Constructor used by the persistence framework during deserialization.</summary>
    [StorableConstructor]
    private LinearRegression(StorableConstructorFlag _) : base(_) { }

    /// <summary>Copy constructor used by <see cref="Clone(Cloner)"/>.</summary>
    private LinearRegression(LinearRegression original, Cloner cloner)
      : base(original, cloner) {
    }

    /// <summary>Creates a new algorithm instance with a fresh <see cref="RegressionProblem"/>.</summary>
    public LinearRegression()
      : base() {
      Problem = new RegressionProblem();
    }

    // Intentionally empty; kept as the hook for post-deserialization fix-ups.
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() { }

    /// <summary>Creates a clone of this algorithm via the copy constructor.</summary>
    public override IDeepCloneable Clone(Cloner cloner) {
      return new LinearRegression(this, cloner);
    }

    #region linear regression
    /// <summary>
    /// Fits the linear model on the training partition of the problem data and adds both solutions,
    /// the training RMSE, the cross-validation RMSE estimate and the parameter statistics to the results.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token supplied by the framework (not consulted by this method).</param>
    protected override void Run(CancellationToken cancellationToken) {
      var problemData = Problem.ProblemData;
      double rmsError, cvRmsError;

      // produce both solutions, to allow symbolic manipulation of LR solutions as well
      // as the calculation of prediction intervals.
      // There is no clean way to implement the new model class for LR as a symbolic model.
      var solution = CreateSolution(problemData, out rmsError, out cvRmsError, out _);
      // 618 covers the call to the [Obsolete] factory method below.
#pragma warning disable 168, 618, 3021
      var symbolicSolution = CreateLinearRegressionSolution(problemData, out rmsError, out cvRmsError, out var statistics);
#pragma warning restore 168, 618, 3021

      Results.Add(new Result(SolutionResultName, "The linear regression solution.", symbolicSolution));
      Results.Add(new Result(ConfidenceSolutionResultName, "Linear regression solution with parameter covariance matrix " +
                                                           "and calculation of prediction intervals", solution));
      Results.Add(new Result("Root mean square error", "The root of the mean of squared errors of the linear regression solution on the training set.", new DoubleValue(rmsError)));
      Results.Add(new Result("Estimated root mean square error (crossvalidation)", "The estimated root of the mean of squared errors of the linear regression solution via cross validation.", new DoubleValue(cvRmsError)));

      // one name per coefficient; the constant term is reported last
      var predictorNames = problemData.AllowedInputVariables.Concat(new[] { "<const>" }).ToArray();
      Results.AddOrUpdateResult("Statistics", statistics.AsResultCollection(predictorNames));
    }

    /// <summary>
    /// Fits a linear model on the training rows and returns it as a symbolic regression solution.
    /// </summary>
    /// <param name="problemData">The regression problem data; its training indices are used for fitting.</param>
    /// <param name="rmsError">Root mean square error reported by ALGLIB.</param>
    /// <param name="cvRmsError">Cross-validation estimate of the root mean square error reported by ALGLIB.</param>
    /// <param name="statistics">Parameter statistics calculated for the fitted coefficients.</param>
    /// <returns>A symbolic regression solution created on a clone of <paramref name="problemData"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">ALGLIB reported a failure while building the model.</exception>
    /// <exception cref="NotSupportedException">The input data contains NaN or infinity values.</exception>
    [Obsolete("Use CreateSolution() instead")]
    public static ISymbolicRegressionSolution CreateLinearRegressionSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError, out Statistics statistics) {
      if (problemData == null) throw new ArgumentNullException(nameof(problemData));

      IEnumerable<string> doubleVariables;
      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables;
      double[,] inputMatrix;
      PrepareData(problemData, out inputMatrix, out doubleVariables, out factorVariables);

      int nRows = inputMatrix.GetLength(0);
      // the last column of the input matrix holds the target variable
      int nFeatures = inputMatrix.GetLength(1) - 1;

      alglib.lrbuild(inputMatrix, nRows, nFeatures, out int retVal, out var lm, out var ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
      rmsError = ar.rmserror;
      cvRmsError = ar.cvrmserror;

      // last coefficient is for the constant
      alglib.lrunpack(lm, out double[] coefficients, out nFeatures);

      // prepare inputmatrix (which has y as last column) for calculation of parameter statistics
      // the last coefficient is the offset
      // NOTE(review): only the offset is subtracted from y here, not the full model prediction;
      // confirm this is what Statistics.CalculateParameterStatistics expects.
      var resid = new double[nRows];
      for (int r = 0; r < nRows; r++) {
        resid[r] = inputMatrix[r, nFeatures] - coefficients[nFeatures];
        inputMatrix[r, nFeatures] = 1.0;
      }
      statistics = Statistics.CalculateParameterStatistics(inputMatrix, coefficients, resid);

      // coefficient layout: factor (binary) columns first, then double variables, then the constant
      int nFactorCoeff = factorVariables.Sum(kvp => kvp.Value.Count());
      int nVarCoeff = doubleVariables.Count();
      var tree = LinearModelToTreeConverter.CreateTree(factorVariables, coefficients.Take(nFactorCoeff).ToArray(),
        doubleVariables.ToArray(), coefficients.Skip(nFactorCoeff).Take(nVarCoeff).ToArray(),
        @const: coefficients[nFeatures]);

      SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
        new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter(), parameterCovariance: statistics.CovMx),
        (IRegressionProblemData)problemData.Clone());
      solution.Model.Name = "Linear Regression Model";
      solution.Name = "Linear Regression Solution";
      return solution;
    }

    /// <summary>
    /// Fits a linear model on the training rows and returns a solution based on
    /// <see cref="LinearRegressionModel"/>, which receives the covariance matrix reported by ALGLIB.
    /// </summary>
    /// <param name="problemData">The regression problem data; its training indices are used for fitting.</param>
    /// <param name="rmsError">Root mean square error reported by ALGLIB.</param>
    /// <param name="cvRmsError">Cross-validation estimate of the root mean square error reported by ALGLIB.</param>
    /// <param name="statistics">Parameter statistics calculated for the fitted coefficients.</param>
    /// <returns>A regression solution created on a clone of <paramref name="problemData"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">ALGLIB reported a failure while building the model.</exception>
    /// <exception cref="NotSupportedException">The input data contains NaN or infinity values.</exception>
    public static IRegressionSolution CreateSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError, out Statistics statistics) {
      if (problemData == null) throw new ArgumentNullException(nameof(problemData));

      IEnumerable<string> doubleVariables;
      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables;
      double[,] inputMatrix;
      PrepareData(problemData, out inputMatrix, out doubleVariables, out factorVariables);

      int nRows = inputMatrix.GetLength(0);
      // the last column of the input matrix holds the target variable
      int nFeatures = inputMatrix.GetLength(1) - 1;

      alglib.lrbuild(inputMatrix, nRows, nFeatures, out int retVal, out var lm, out var ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
      rmsError = ar.rmserror;
      cvRmsError = ar.cvrmserror;

      // get parameters of the model
      alglib.lrunpack(lm, out double[] w, out _);

      // prepare inputmatrix (which has y as last column) for calculation of parameter statistics
      // the last coefficient is the offset
      // NOTE(review): only the offset is subtracted from y here, not the full model prediction;
      // confirm this is what Statistics.CalculateParameterStatistics expects.
      var resid = new double[nRows];
      for (int r = 0; r < nRows; r++) {
        resid[r] = inputMatrix[r, nFeatures] - w[nFeatures];
        inputMatrix[r, nFeatures] = 1.0;
      }
      statistics = Statistics.CalculateParameterStatistics(inputMatrix, w, resid);

      // ar.c is the covariation matrix, array[0..NVars,0..NVars].
      var solution = new LinearRegressionModel(w, ar.c, cvRmsError, problemData.TargetVariable, doubleVariables, factorVariables)
        .CreateRegressionSolution((IRegressionProblemData)problemData.Clone());
      solution.Name = "Linear Regression Solution";
      return solution;
    }

    /// <summary>
    /// Builds the training matrix: binary columns for the factor variables, followed by the
    /// double input variables, followed by the target variable as the last column.
    /// </summary>
    /// <param name="problemData">Source of the dataset, allowed input variables, target variable and training indices.</param>
    /// <param name="inputMatrix">The assembled matrix restricted to the training rows.</param>
    /// <param name="doubleVariables">Allowed input variables of type <c>double</c>.</param>
    /// <param name="factorVariables">Allowed input variables of type <c>string</c> together with their values on the training rows.</param>
    /// <exception cref="NotSupportedException">The assembled matrix contains NaN or infinity values.</exception>
    private static void PrepareData(IRegressionProblemData problemData,
      out double[,] inputMatrix,
      out IEnumerable<string> doubleVariables,
      out IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables) {
      var dataset = problemData.Dataset;
      string targetVariable = problemData.TargetVariable;
      IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
      IEnumerable<int> rows = problemData.TrainingIndices;
      // materialized once because callers enumerate the variable names several times
      doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>).ToArray();
      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
      factorVariables = dataset.GetFactorVariableValues(factorVariableNames, rows);
      double[,] binaryMatrix = dataset.ToArray(factorVariables, rows);
      double[,] doubleVarMatrix = dataset.ToArray(doubleVariables.Concat(new string[] { targetVariable }), rows);
      inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);

      if (inputMatrix.ContainsNanOrInfinity())
        throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
    }
    #endregion
  }
}

