Ignore:
Timestamp:
11/06/17 13:12:41 (4 years ago)
Author:
gkronber
Message:

#2789 more tests with CV and automatic determination of smoothing parameter

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs

    r15449 r15450  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Diagnostics;
    2425using System.Linq;
    2526using System.Threading;
     
    156157          case "CUBGCV": {
    157158              CubicSplineGCV.CubGcvReport report;
    158               var model = 
    159                 CubicSplineGCV.CalculateCubicSpline(x, y, 
     159              var model =
     160                CubicSplineGCV.CalculateCubicSpline(x, y,
    160161                Problem.ProblemData.TargetVariable, inputVars, out report);
    161162              var targetVar = Problem.ProblemData.TargetVariable;
     
    165166              Results.Add(new Result("Estimated error variance", new DoubleValue(report.estimatedErrorVariance)));
    166167              Results.Add(new Result("Estimated RSS degrees of freedom", new DoubleValue(report.estimatedRSSDegreesOfFreedom)));
    167               Results.Add(new Result("Estimated treu mean squared error at data points", new DoubleValue(report.estimatedTrueMeanSquaredErrorAtDataPoints)));
     168              Results.Add(new Result("Estimated true mean squared error at data points", new DoubleValue(report.estimatedTrueMeanSquaredErrorAtDataPoints)));
    168169              Results.Add(new Result("Mean square of DF", new DoubleValue(report.meanSquareOfDf)));
    169170              Results.Add(new Result("Mean square residual", new DoubleValue(report.meanSquareResiudal)));
     
    204205    public static IRegressionEnsembleModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars,
    205206      out double avgTrainRMSE,
    206       out double loocvRMSE) {
     207      out double cvRMSE) {
    207208      int info;
    208209      alglib.spline1dinterpolant interpolant;
     
    211212      n = (int)Math.Max(50, 3 * Math.Sqrt(n));
    212213
    213       double[] x_loo = new double[x.Length - 1];
    214       double[] y_loo = new double[y.Length - 1];
     214      int folds = 10;
     215      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
     216      if (foldSize == 1) folds = x.Length;
     217
     218      double[] x_loo = new double[(folds - 1) * foldSize];
     219      double[] y_loo = new double[(folds - 1) * foldSize];
    215220
    216221      var models = new List<IRegressionModel>();
    217       var sumTrainRmse = 0.0;
    218       var sse = 0.0;
    219 
    220       for (int i = 0; i < x.Length; i++) {
    221         Array.Copy(x, 0, x_loo, 0, i);
    222         Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
    223         Array.Copy(y, 0, y_loo, 0, i);
    224         Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
     222      var sumTrainSE = 0.0;
     223      int nTrain = 0;
     224      var sumTestSE = 0.0;
     225      int nTest = 0;
     226
     227
     228      for (int i = 0; i < folds; i++) {
     229        if (i < folds - 1) {
     230          //Console.WriteLine("Copy from {0} to {1} n = {2}", (i + 1) * foldSize, i * foldSize, (folds - i - 1) * foldSize);
     231          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, (folds - i - 1) * foldSize);
     232          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, (folds - i - 1) * foldSize);
     233        }
    225234        alglib.spline1dfitpenalized(x_loo, y_loo, n, lambda, out info, out interpolant, out rep);
    226         var y_pred = alglib.spline1dcalc(interpolant, x[i]);
    227         double res = y[i] - y_pred;
    228         sse += res * res;
    229         sumTrainRmse += rep.rmserror;
    230 
    231         var model = new Spline1dModel(interpolant, targetVar, inputVars);
    232         models.Add(model);
    233       }
    234 
    235       loocvRMSE = Math.Sqrt(sse / x.Length);
    236       avgTrainRMSE = sumTrainRmse / x.Length;
     235        Debug.Assert(y_loo.All(yi => yi != 0.0));
     236        Debug.Assert(x_loo.All(xi => xi != 0.0));
     237
     238        // training error
     239        for (int j = 0; j < x_loo.Length; j++) {
     240          var y_pred = alglib.spline1dcalc(interpolant, x[i]);
     241          double res = y[j] - y_pred;
     242          sumTrainSE += res * res;
     243          nTrain++;
     244        }
     245        // test
     246        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
     247          var y_pred = alglib.spline1dcalc(interpolant, x[i]);
     248          double res = y[j] - y_pred;
     249          sumTestSE += res * res;
     250          nTest++;
     251        }
     252
     253        models.Add(new Spline1dModel(interpolant, targetVar, inputVars));
     254
     255        if (i < folds - 1) {
     256          //Console.WriteLine("Copy from {0} to {1} n = {2}", i * foldSize, i * foldSize, foldSize);
     257          // add current fold for next iteration
     258          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
     259          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
     260        }
     261      }
     262
     263      cvRMSE = Math.Sqrt(sumTestSE / nTest);
     264      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
    237265
    238266      return new RegressionEnsembleModel(models);
     
    352380    public static IRegressionEnsembleModel CalculateSmoothingSplineReinsch(double[] x, double[] y, double tol, string targetVar, string[] inputVars,
    353381      out double avgTrainRMSE,
    354       out double loocvRMSE) {
     382      out double cvRMSE) {
    355383      int info;
    356384
    357       double[] x_loo = new double[x.Length - 1];
    358       double[] y_loo = new double[y.Length - 1];
     385      int folds = 10;
     386      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
     387      if (foldSize == 1) folds = x.Length;
     388
     389      double[] x_loo = new double[(folds - 1) * foldSize];
     390      double[] y_loo = new double[(folds - 1) * foldSize];
    359391
    360392      var models = new List<IRegressionModel>();
    361       var sumTrainRmse = 0.0;
    362       var sse = 0.0;
    363 
    364       for (int i = 0; i < x.Length; i++) {
    365         Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
    366         Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
    367 
    368         var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, x_loo.Length + 1, targetVar);
    369 
    370         var y_pred = model.GetEstimatedValue(x[i]);
    371         double res = y[i] - y_pred;
    372         sse += res * res;
     393      var sumTrainSE = 0.0;
     394      int nTrain = 0;
     395      var sumTestSE = 0.0;
     396      int nTest = 0;
     397
     398
     399      for (int i = 0; i < folds; i++) {
     400        if (i < folds - 1) {
     401          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, x.Length - ((i + 1) * foldSize) - 1);
     402          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, y.Length - ((i + 1) * foldSize) - 1);
     403        }
     404        var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, ((folds - 1) * foldSize) - 1, targetVar);
     405
     406        // training error
     407        for(int j = 0;j<x_loo.Length;j++) {
     408          var y_pred = model.GetEstimatedValue(x[j]);
     409          double res = y[j] - y_pred;
     410          sumTrainSE += res * res;
     411          nTrain++;
     412        }
     413        // test
     414        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
     415          var y_pred = model.GetEstimatedValue(x[j]);
     416          double res = y[j] - y_pred;
     417          sumTestSE += res * res;
     418          nTest++;
     419        }
    373420
    374421        models.Add(model);
    375422
    376         // add current row for next iteration
    377         if (i < x_loo.Length) {
    378           x_loo[i] = x[i];
    379           y_loo[i] = y[i];
    380         }
    381       }
    382 
    383       loocvRMSE = Math.Sqrt(sse / x.Length);
    384       avgTrainRMSE = sumTrainRmse / x.Length;
     423        if (i < folds - 1) {
     424          // add current fold for next iteration
     425          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
     426          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
     427        }
     428      }
     429
     430      cvRMSE = Math.Sqrt(sumTestSE / nTest);
     431      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
    385432
    386433      return new RegressionEnsembleModel(models);
     
    388435
    389436
    390     // automatically determines tolerance using loo cv
     437    // automatically determines tolerance using loo cv, SLOW!
    391438    public static IRegressionModel CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars, string targetVar,
    392       out double optTolerance, out double looRMSE) {
     439      out double optTolerance, out double cvRMSE) {
    393440      var maxTolerance = y.StandardDeviation();
    394441      var minTolerance = maxTolerance * 1E-6;
    395442      optTolerance = maxTolerance;
    396443      var tol = maxTolerance;
    397       looRMSE = double.PositiveInfinity;
     444      cvRMSE = double.PositiveInfinity;
    398445      IRegressionModel bestModel = new ConstantModel(y.Average(), targetVar);
    399446      while (tol > minTolerance) {
    400         double loo_rmse;
     447        double curCVRMSE;
    401448        double avgTrainRmse;
    402449        var model = Splines.CalculateSmoothingSplineReinsch(
    403           x, y, tol, targetVar, inputVars, out avgTrainRmse, out loo_rmse);
    404 
    405         if (loo_rmse < looRMSE) {
    406           looRMSE = loo_rmse;
     450          x, y, tol, targetVar, inputVars, out avgTrainRmse, out curCVRMSE);
     451
     452        if (curCVRMSE < cvRMSE) {
     453          cvRMSE = curCVRMSE;
    407454          optTolerance = tol;
    408455          bestModel = model;
Note: See TracChangeset for help on using the changeset viewer.