Changeset 15744


Ignore:
Timestamp:
02/09/18 10:51:27 (20 months ago)
Author:
gkronber
Message:

#2892: implemented a new class LinearRegressionModel that produces variances for predictions (IConfidenceRegressionModel). Changed LR to produce a ConfidenceRegressionSolution (old static method is kept but marked obsolete)

Location:
branches/2892_LR-prediction-intervals/HeuristicLab.Algorithms.DataAnalysis/3.4
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • branches/2892_LR-prediction-intervals/HeuristicLab.Algorithms.DataAnalysis/3.4/HeuristicLab.Algorithms.DataAnalysis-3.4.csproj

    r15532 r15744  
    262262    <Compile Include="Linear\MultinomialLogitClassification.cs" />
    263263    <Compile Include="Linear\MultinomialLogitClassificationSolution.cs" />
     264    <Compile Include="Linear\LinearRegressionModel.cs" />
    264265    <Compile Include="Linear\MultinomialLogitModel.cs" />
    265266    <Compile Include="Linear\Scaling.cs" />
     
    320321    <Compile Include="TSNE\Distances\IndexedItemDistance.cs" />
    321322    <Compile Include="TSNE\Distances\ManhattanDistance.cs" />
    322   <Compile Include="TSNE\Distances\WeightedEuclideanDistance.cs" />
     323    <Compile Include="TSNE\Distances\WeightedEuclideanDistance.cs" />
    323324    <Compile Include="TSNE\Distances\IDistance.cs" />
    324325    <Compile Include="TSNE\PriorityQueue.cs" />
  • branches/2892_LR-prediction-intervals/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/LinearRegression.cs

    r15583 r15744  
    6262    protected override void Run(CancellationToken cancellationToken) {
    6363      double rmsError, cvRmsError;
    64       var solution = CreateLinearRegressionSolution(Problem.ProblemData, out rmsError, out cvRmsError);
     64      var solution = CreateSolution(Problem.ProblemData, out rmsError, out cvRmsError);
    6565      Results.Add(new Result(LinearRegressionModelResultName, "The linear regression solution.", solution));
    6666      Results.Add(new Result("Root mean square error", "The root of the mean of squared errors of the linear regression solution on the training set.", new DoubleValue(rmsError)));
     
    6868    }
    6969
     70    [Obsolete("Use CreateSolution() instead")]
    7071    public static ISymbolicRegressionSolution CreateLinearRegressionSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError) {
    71       var dataset = problemData.Dataset;
    72       string targetVariable = problemData.TargetVariable;
    73       IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
    74       IEnumerable<int> rows = problemData.TrainingIndices;
    75       var doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>);
    76       var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
    77       var factorVariables = dataset.GetFactorVariableValues(factorVariableNames, rows);
    78       double[,] binaryMatrix = dataset.ToArray(factorVariables, rows);
    79       double[,] doubleVarMatrix = dataset.ToArray(doubleVariables.Concat(new string[] { targetVariable }), rows);
    80       var inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);
    81 
    82       if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
    83         throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
     72      IEnumerable<string> doubleVariables;
     73      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables;
     74      double[,] inputMatrix;
     75      PrepareData(problemData, out inputMatrix, out doubleVariables, out factorVariables);
    8476
    8577      alglib.linearmodel lm = new alglib.linearmodel();
     
    8779      int nRows = inputMatrix.GetLength(0);
    8880      int nFeatures = inputMatrix.GetLength(1) - 1;
    89       double[] coefficients = new double[nFeatures + 1]; // last coefficient is for the constant
    9081
    9182      int retVal = 1;
     
    9586      cvRmsError = ar.cvrmserror;
    9687
     88      double[] coefficients = new double[nFeatures + 1]; // last coefficient is for the constant
    9789      alglib.lrunpack(lm, out coefficients, out nFeatures);
    98 
    99       int nFactorCoeff = binaryMatrix.GetLength(1);
     90     
     91      int nFactorCoeff = factorVariables.Sum(kvp=>kvp.Value.Count());
    10092      int nVarCoeff = doubleVariables.Count();
    10193      var tree = LinearModelToTreeConverter.CreateTree(factorVariables, coefficients.Take(nFactorCoeff).ToArray(),
     
    108100      return solution;
    109101    }
     102
     103    public static IRegressionSolution CreateSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError) {
              // Fits a linear regression with alglib on the matrix produced by PrepareData and wraps the
              // result in a LinearRegressionModel-based regression solution.
              //   problemData: supplies dataset, target variable, allowed inputs and training rows (read in PrepareData).
              //   rmsError:    set to ar.rmserror from the alglib report.
              //   cvRmsError:  set to ar.cvrmserror from the alglib report.
              // Throws ArgumentException when lrbuild does not return 1; a NotSupportedException thrown by
              // PrepareData (NaN/infinity in the data) propagates unchanged.
     104      IEnumerable<string> doubleVariables;
     105      IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables;
     106      double[,] inputMatrix;
     107      PrepareData(problemData, out inputMatrix, out doubleVariables, out factorVariables);
     108
     109      alglib.linearmodel lm = new alglib.linearmodel();
     110      alglib.lrreport ar = new alglib.lrreport();
     111      int nRows = inputMatrix.GetLength(0);
              // PrepareData appends the target variable as the last column, so it is not counted as a feature.
     112      int nFeatures = inputMatrix.GetLength(1) - 1;
     113
     114      int retVal = 1;
     115      alglib.lrbuild(inputMatrix, nRows, nFeatures, out retVal, out lm, out ar);
              // Any return code other than 1 is treated as a failed fit.
     116      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");
     117      rmsError = ar.rmserror;
     118      cvRmsError = ar.cvrmserror;
     119
     120      // get parameters of the model
              // NOTE(review): w presumably holds one weight per feature followed by the constant term — confirm against alglib.lrunpack.
              // nVars is only needed to satisfy the out parameter; it is not read afterwards in this method.
     121      double[] w;
     122      int nVars;
     123      alglib.lrunpack(lm, out w, out nVars);
     124
     125      // ar.c is the covariance matrix,  array[0..NVars,0..NVars].
     126      // C[i, j] = Cov(A[i], A[j])
     127
              // cvRmsError is handed to the model as its third constructor argument; the model is then applied to a
              // clone of problemData, so the returned solution does not share the caller's problem data instance.
     128      var solution = new LinearRegressionModel(w, ar.c, cvRmsError, problemData.TargetVariable, doubleVariables, factorVariables)
     129        .CreateRegressionSolution((IRegressionProblemData)problemData.Clone());
     130      solution.Name = "Linear Regression Solution";
     131      return solution;
     132    }
     133
     134    private static void PrepareData(IRegressionProblemData problemData,
     135      out double[,] inputMatrix,
     136      out IEnumerable<string> doubleVariables,
     137      out IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables) {
              // Builds the training matrix shared by both solution factories.
              //   inputMatrix:     factor-variable columns first, then the double-typed input variables,
              //                    with the target variable as the last column; rows are the training indices.
              //   doubleVariables: allowed input variables of type double (a Where query, not materialized here).
              //   factorVariables: result of dataset.GetFactorVariableValues for the string-typed allowed inputs.
              // Throws NotSupportedException when any cell of inputMatrix (inputs or target) is NaN or infinite.
     138      var dataset = problemData.Dataset;
     139      string targetVariable = problemData.TargetVariable;
     140      IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
     141      IEnumerable<int> rows = problemData.TrainingIndices;
     142      doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>);
     143      var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
     144      factorVariables = dataset.GetFactorVariableValues(factorVariableNames, rows);
              // NOTE(review): binaryMatrix presumably holds an indicator (0/1) encoding of the factor values — confirm against dataset.ToArray.
     145      double[,] binaryMatrix = dataset.ToArray(factorVariables, rows);
              // The target is appended after the double inputs so that it ends up in the last column.
     146      double[,] doubleVarMatrix = dataset.ToArray(doubleVariables.Concat(new string[] { targetVariable }), rows);
     147      inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);
     148
              // Reject the data up front instead of passing non-finite values on to the solver.
     149      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
     150        throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
     151    }
    110152    #endregion
    111153  }
Note: See TracChangeset for help on using the changeset viewer.