Changeset 15442


Ignore:
Timestamp:
10/31/17 20:01:02 (22 months ago)
Author:
gkronber
Message:

#2789: added cross-validation and automatic determination of regularization parameter (naive CV is slow!).

Research produced ACM Algorithm 642.f (Fortran) for univariate cubic spline smoothing with generalized cross-validation in O(n) described in HUTCHINSON, M. F., AND DE HOOG, F.R. Smoothing noisy data with spline functions. Numer. Math. 4 7 (1985), 99-106. (TODO: translate to C#)

Location:
branches/MathNetNumerics-Exploration-2789
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs

    r15436 r15442  
    218218          product = product.Zip(problemData.Dataset.GetDoubleValues(inputVars[i], problemData.TrainingIndices), (pi, vi) => pi * vi).ToArray();
    219219        }
    220         // Umständlich!
    221         return Splines.CalculatePenalizedRegressionSpline(
     220        double optTolerance, looRMSE;
     221        return Splines.CalculateSmoothingSplineReinsch(
    222222          product,
    223           (double[])target.Clone(), lambda,
    224           problemData.TargetVariable, inputVars
     223          (double[])target.Clone(), inputVars,
     224          problemData.TargetVariable,
     225          out optTolerance, out looRMSE
    225226          );
    226227      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs

    r15433 r15442  
    2121
    2222using System;
    23 using System.Collections.Concurrent;
    2423using System.Collections.Generic;
    2524using System.Linq;
    2625using System.Threading;
    27 using System.Threading.Tasks;
    2826using HeuristicLab.Common;
    2927using HeuristicLab.Core;
    3028using HeuristicLab.Data;
    31 using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
    3229using HeuristicLab.Optimization;
    3330using HeuristicLab.Parameters;
    3431using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
    3532using HeuristicLab.Problems.DataAnalysis;
    36 using HeuristicLab.Problems.DataAnalysis.Symbolic;
    37 using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
    3833using HeuristicLab.Random;
    3934using MathNet.Numerics.Interpolation;
     
    7166          "Smoothing Spline (Mine)",
    7267          "Smoothing Spline (Reinsch)",
    73           "Smoothing Spline (Reinsch with automatic tolerance determination using CV)",
     68          "Smoothing Spline (Reinsch with automatic tolerance determination using LOOCV)",
    7469          "B-Spline Smoothing",
    7570          "Penalized Regression Spline (alglib)"
     
    150145              break;
    151146            }
    152           case "Smoothing Spline (Reinsch with automatic tolerance determination using CV)": {
    153               var model = CalculateSmoothingSplineReinschWithAutomaticTolerance(x, y, inputVars, Problem.ProblemData.TargetVariable);
    154               Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone())));
     147          case "Smoothing Spline (Reinsch with automatic tolerance determination using LOOCV)": {
     148              double optTol, looRMSE;
     149              var model = CalculateSmoothingSplineReinsch(x, y, inputVars, Problem.ProblemData.TargetVariable, out optTol, out looRMSE);
     150              Results.Add(new Result("Solution", model.CreateRegressionSolution((IRegressionProblemData)Problem.ProblemData.Clone())));
     151              Results.Add(new Result("Optimal tolerance", new DoubleValue(optTol)));
     152              Results.Add(new Result("RMSE (LOO-CV)", new DoubleValue(looRMSE)));
    155153              break;
    156154            }
     
    159157              break;
    160158            }
    161           case "Penalized Regression Spline (alglib)":
    162           {
     159          case "Penalized Regression Spline (alglib)": {
    163160              var lambda = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value; // controls extent of smoothing
    164161              var model = CalculatePenalizedRegressionSpline(x, y, lambda, Problem.ProblemData.TargetVariable, inputVars);
     
    168165
    169166              break;
    170           }
     167            }
    171168          default: throw new NotSupportedException();
    172169        }
     
    176173
    177174
    178 
    179     public static IRegressionModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars)
    180     {
     175    public static IRegressionModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars) {
    181176      int info;
    182177      alglib.spline1dinterpolant interpolant;
     
    189184    }
    190185
     186    public static IRegressionEnsembleModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars,
     187      out double avgTrainRMSE,
     188      out double loocvRMSE) {
     189      int info;
     190      alglib.spline1dinterpolant interpolant;
     191      alglib.spline1dfitreport rep;
     192      int n = x.Length;
     193      n = (int)Math.Max(50, 3 * Math.Sqrt(n));
     194
     195      double[] x_loo = new double[x.Length - 1];
     196      double[] y_loo = new double[y.Length - 1];
     197
     198      var models = new List<IRegressionModel>();
     199      var sumTrainRmse = 0.0;
     200      var sse = 0.0;
     201
     202      for (int i = 0; i < x.Length; i++) {
     203        Array.Copy(x, 0, x_loo, 0, i);
     204        Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
     205        Array.Copy(y, 0, y_loo, 0, i);
     206        Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
     207        alglib.spline1dfitpenalized(x_loo, y_loo, n, lambda, out info, out interpolant, out rep);
     208        var y_pred = alglib.spline1dcalc(interpolant, x[i]);
     209        double res = y[i] - y_pred;
     210        sse += res * res;
     211        sumTrainRmse += rep.rmserror;
     212
     213        var model = new Spline1dModel(interpolant, targetVar, inputVars);
     214        models.Add(model);
     215      }
     216
     217      loocvRMSE = Math.Sqrt(sse / x.Length);
     218      avgTrainRMSE = sumTrainRmse / x.Length;
     219
     220      return new RegressionEnsembleModel(models);
     221    }
     222
    191223    public void BSplineSmoothing(double[] x, double[] y, string[] inputVars) {
    192224      Array.Sort(x, y);
    193 
    194225      CubicSpline2(x, y, inputVars);
    195 
    196       //  // see Elements of Statistical Learning Appendix: Computations for Splines
    197       //  int M = 4; // order of spline (cubic)
    198       //  int K = x.Length;
    199       //  int n = x.Length;
    200       //
    201       //  double[] t = new double[K + 2 * M];
    202       //  for (int i = 0; i < M; i++) t[i] = x[0];
    203       //  for (int i = K; i < K + 2 * M; i++) t[i] = x[K - 1];
    204       //
    205       //  double[,] B = new double[n, n + 4]; // basis function matrix
    206       //  double[,] W = new double[n + 4, n + 4]; // penalty matrix (W[j,k] =
    207       //  for (int i = 0; i < M; i++) B[i] = new double[K + 2 * M];
    208       //  for (int j = 0; j < K; j++) {
    209       //    for (int i = 0; i < K + 2M - 1; i++) {
    210       //      if (t[i] <= x[j] && x[j] < t[i + 1])
    211       //        B[1][i] = 1.0;
    212       //    }
    213       //  }
    214 
    215 
    216226    }
    217227
     
    312322    }
    313323
    314     public void CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars) {
    315       double s = xOrig.Length + 1;
     324    public void CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars) {
     325      double s = x.Length + 1;
    316326      double stdTol = ((IValueParameter<DoubleValue>)Parameters["StdDev (noise)"]).Value.Value; // controls extent of smoothing
    317327
    318       var model = CalculateSmoothingSplineReinsch(xOrig, yOrig, inputVars, stdTol, s, Problem.ProblemData.TargetVariable);
    319 
    320       Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone())));
    321 
    322 
    323     }
    324 
    325 
    326 
    327     public static IRegressionModel CalculateSmoothingSplineReinschWithAutomaticTolerance(double[] xOrig, double[] yOrig, string[] inputVars, string targetVar) {
    328       int n = xOrig.Length;
    329       var rows = Enumerable.Range(0, n);
    330       var rand = new FastRandom(1234);
    331       var trainRows = rows.Shuffle(rand).Take((int)(n * 0.66)).ToArray();
    332       var testRows = rows.Except(trainRows).ToArray();
    333 
    334       var trainX = trainRows.Select(i => xOrig[i]).ToArray();
    335       var trainY = trainRows.Select(i => yOrig[i]).ToArray();
    336       var testX = testRows.Select(i => xOrig[i]).ToArray();
    337       var testY = testRows.Select(i => yOrig[i]).ToArray();
    338 
    339       var shuffledDs = new Dataset(inputVars.Concat(new string[] { targetVar }), new[] { trainX.Concat(testX).ToArray(), trainY.Concat(testY).ToArray() });
    340       var shuffeledProblemData = new RegressionProblemData(shuffledDs, inputVars, targetVar);
    341       shuffeledProblemData.TrainingPartition.Start = 0;
    342       shuffeledProblemData.TrainingPartition.End = trainX.Length;
    343       shuffeledProblemData.TestPartition.Start = trainX.Length;
    344       shuffeledProblemData.TestPartition.End = shuffledDs.Rows + 1;
    345 
    346       double curTol = trainY.StandardDeviation();
    347       double prevTestRMSE = double.PositiveInfinity;
    348       double testRMSE = double.PositiveInfinity;
    349       IRegressionModel prevModel = null;
    350       IRegressionModel model = null;
    351       do {                                     
    352         prevModel = model;
    353         prevTestRMSE = testRMSE;
    354         model = CalculateSmoothingSplineReinsch(trainX, trainY, inputVars, curTol, s: trainX.Length + 1, targetVar: targetVar);
    355 
    356         var sol = model.CreateRegressionSolution(shuffeledProblemData);
    357         var trainRMSE = sol.TrainingRootMeanSquaredError;
    358         testRMSE = sol.TestRootMeanSquaredError;
    359         curTol = Math.Max(curTol * 0.5, 1e-12 * trainY.StandardDeviation());
    360       } while (testRMSE < prevTestRMSE);
    361       return prevModel;
    362     }
    363 
    364 
    365     public static IRegressionModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double stdTol, double s, string targetVar) {
     328      var model = CalculateSmoothingSplineReinsch(x, y, inputVars, stdTol, s, Problem.ProblemData.TargetVariable);
     329
     330      Results.Add(new Result("Solution", model.CreateRegressionSolution((IRegressionProblemData)Problem.ProblemData.Clone())));
     331    }
     332
     333
     334    public static IRegressionEnsembleModel CalculateSmoothingSplineReinsch(double[] x, double[] y, double tol, string targetVar, string[] inputVars,
     335      out double avgTrainRMSE,
     336      out double loocvRMSE) {
     337      int info;
     338
     339      double[] x_loo = new double[x.Length - 1];
     340      double[] y_loo = new double[y.Length - 1];
     341
     342      var models = new List<IRegressionModel>();
     343      var sumTrainRmse = 0.0;
     344      var sse = 0.0;
     345
     346      for (int i = 0; i < x.Length; i++) {
     347        Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
     348        Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
     349
     350        var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, x_loo.Length + 1, targetVar);
     351
     352        var y_pred = model.GetEstimatedValue(x[i]);
     353        double res = y[i] - y_pred;
     354        sse += res * res;
     355
     356        models.Add(model);
     357
     358        // add current row for next iteration
     359        if (i < x_loo.Length) {
     360          x_loo[i] = x[i];
     361          y_loo[i] = y[i];
     362        }
     363      }
     364
     365      loocvRMSE = Math.Sqrt(sse / x.Length);
     366      avgTrainRMSE = sumTrainRmse / x.Length;
     367
     368      return new RegressionEnsembleModel(models);
     369    }
     370
     371
     372    // automatically determines tolerance using loo cv
     373    public static IRegressionModel CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars, string targetVar,
     374      out double optTolerance, out double looRMSE) {
     375      var maxTolerance = y.StandardDeviation();
     376      var minTolerance = maxTolerance * 1E-6;
     377      optTolerance = maxTolerance;
     378      var tol = maxTolerance;
     379      looRMSE = double.PositiveInfinity;
     380      IRegressionModel bestModel = new ConstantModel(y.Average(), targetVar);
     381      while (tol > minTolerance) {
     382        double loo_rmse;
     383        double avgTrainRmse;
     384        var model = Splines.CalculateSmoothingSplineReinsch(
     385          x, y, tol, targetVar, inputVars, out avgTrainRmse, out loo_rmse);
     386
     387        if (loo_rmse < looRMSE) {
     388          looRMSE = loo_rmse;
     389          optTolerance = tol;
     390          bestModel = model;
     391        }
     392        tol *= 0.85;
     393      }
     394      return bestModel;
     395    }
     396
     397    public static ReinschSmoothingSplineModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double stdTol, double s, string targetVar) {
    366398      var minX = xOrig.Min();
    367399      var maxX = xOrig.Max();
     
    605637
    606638  // UNFINISHED
    607   internal class ReinschSmoothingSplineModel : NamedItem, IRegressionModel {
     639  public class ReinschSmoothingSplineModel : NamedItem, IRegressionModel {
    608640    private MyArray<double> a;
    609641    private MyArray<double> b;
     
    631663      this.offset = orig.offset;
    632664    }
    633     public ReinschSmoothingSplineModel(
     665    internal ReinschSmoothingSplineModel(
    634666      MyArray<double> a,
    635667      MyArray<double> b,
     
    650682      // extrapolate for xx > x[n2]
    651683      b[b.Length] = b[b.Length - 1];
    652       d[b.Length] = d[d.Length - 1];
     684      d[1] = 0;
     685      // d[b.Length] = -d[d.Length - 1];
    653686    }
    654687
     
    661694    }
    662695
     696    public double GetEstimatedValue(double xx) {
     697      int n = x.Length;
     698      if (xx <= x[1]) {
     699        double h = xx - x[1];
     700        return a[1] + h * (b[1] + h * (c[1] + h * d[1]));
     701      } else if (xx >= x[n]) {
     702        double h = xx - x[n];
     703        return a[n] + h * (b[n] + h * (c[n] + h * d[n]));
     704      } else {
     705        // binary search
     706        int lower = 1;
     707        int upper = n;
     708        while (true) {
     709          if (upper < lower) throw new InvalidProgramException();
     710          int i = lower + (upper - lower) / 2;
     711          if (x[i] <= xx && xx < x[i + 1]) {
     712            double h = xx - x[i];
     713            return a[i] + h * (b[i] + h * (c[i] + h * d[i]));
     714          } else if (xx < x[i]) {
     715            upper = i - 1;
     716          } else {
     717            lower = i + 1;
     718          }
     719        }
     720      }
     721    }
     722
    663723    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
    664724      int n = x.Length;
    665       foreach (var xx in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).Select(xi => (xi - offset) * scale)) {
    666         if (xx <= x[1]) {
    667           double h = xx - x[1];
    668           yield return a[1] + h * (b[1] + h * (c[1] + h * d[1]));
    669         } else if (xx >= x[n]) {
    670           double h = xx - x[n];
    671           yield return a[n] + h * (b[n] + h * (c[n] + h * d[n]));
    672         } else {
    673           // binary search
    674           int lower = 1;
    675           int upper = n;
    676           while (true) {
    677             if (upper < lower) throw new InvalidProgramException();
    678             int i = lower + (upper - lower) / 2;
    679             if (x[i] <= xx && xx < x[i + 1]) {
    680               double h = xx - x[i];
    681               yield return a[i] + h * (b[i] + h * (c[i] + h * d[i]));
    682               break;
    683             } else if (xx < x[i]) {
    684               upper = i - 1;
    685             } else {
    686               lower = i + 1;
    687             }
    688           }
    689         }
     725      var product = dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).ToArray();
     726      foreach (var factor in VariablesUsedForPrediction.Skip(1)) {
     727        product = product.Zip(dataset.GetDoubleValues(factor, rows), (pi, fi) => pi * fi).ToArray();
     728      }
     729      foreach (var xx in product.Select(xi => (xi - offset) * scale)) {
     730        yield return GetEstimatedValue(xx);
    690731      }
    691732    }
     
    764805    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
    765806      var product = dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).ToArray();
    766       foreach(var factor in VariablesUsedForPrediction.Skip(1)) {
     807      foreach (var factor in VariablesUsedForPrediction.Skip(1)) {
    767808        product = product.Zip(dataset.GetDoubleValues(factor, rows), (pi, fi) => pi * fi).ToArray();
    768809      }
  • branches/MathNetNumerics-Exploration-2789/Test/Test.csproj

    r15348 r15442  
    3636  </PropertyGroup>
    3737  <ItemGroup>
     38    <Reference Include="ALGLIB-3.7.0, Version=3.7.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     39      <SpecificVersion>False</SpecificVersion>
     40      <HintPath>..\..\..\trunk\sources\bin\ALGLIB-3.7.0.dll</HintPath>
     41    </Reference>
     42    <Reference Include="HeuristicLab.Algorithms.DataAnalysis-3.4, Version=3.4.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
    3843    <Reference Include="HeuristicLab.Algorithms.GeneticAlgorithm-3.3">
    3944      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Algorithms.GeneticAlgorithm-3.3.dll</HintPath>
  • branches/MathNetNumerics-Exploration-2789/Test/UnitTest1.cs

    r15313 r15442  
    11using System;
     2using System.Diagnostics;
     3using System.Linq;
    24using HeuristicLab.Algorithms.DataAnalysis.Experimental;
    35using HeuristicLab.Algorithms.GeneticAlgorithm;
     6using HeuristicLab.Common;
     7using HeuristicLab.Problems.DataAnalysis;
    48using HeuristicLab.SequentialEngine;
    59using Microsoft.VisualStudio.TestTools.UnitTesting;
     
    2226    }
    2327  }
     28
     29  [TestClass]
     30  public class PenalizedRegressionSplineTests {
     31    [TestMethod]
     32    public void TestPenalizedRegressionSplinesCrossValidation() {
     33      var xs = HeuristicLab.Common.SequenceGenerator.GenerateSteps(-3.5, 3.5, 0.02, includeEnd: true).ToArray();
     34      var ys = xs.Select(xi => alglib.normaldistr.normaldistribution(xi)); // 1.0 / (Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * xi * xi))).ToArray();
     35
     36      alglib.hqrndstate state;
     37      alglib.hqrndseed(1234, 5678, out state);
     38      var ys_noise = ys.Select(yi => yi + alglib.hqrndnormal(state) * 0.01).ToArray();
     39
     40      double bestRho = -15;
     41      double best_loo_rmse = double.PositiveInfinity;
     42      var clock = new Stopwatch();
     43      clock.Start();
     44      int iters = 0;
     45      for (int rho = -15; rho <= 15; rho++) {
     46        double loo_rmse;
     47        double avgTrainRmse;
     48        Splines.CalculatePenalizedRegressionSpline(xs, ys_noise, rho, "y", new string[] { "x" }, out avgTrainRmse, out loo_rmse);
     49        iters++;
     50        Console.WriteLine("{0} {1} {2}", rho, avgTrainRmse, loo_rmse);
     51        if (loo_rmse < best_loo_rmse) {
     52          best_loo_rmse = loo_rmse;
     53          bestRho = rho;
     54        }
     55      }
     56      clock.Stop();
     57      Console.WriteLine("Best rho {0}, RMSE (LOO): {1}, ms/run {2}", bestRho, best_loo_rmse, clock.ElapsedMilliseconds / (double)iters);
     58    }
     59
     60    [TestMethod]
     61    public void TestReinschSplineCrossValidation() {
     62      var xs = HeuristicLab.Common.SequenceGenerator.GenerateSteps(-3.5, 3.5, 0.02, includeEnd: true).ToList();
     63      var ys = xs.Select(xi => alglib.normaldistr.normaldistribution(xi)); // 1.0 / (Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * xi * xi))).ToArray();
     64
     65      alglib.hqrndstate state;
     66      alglib.hqrndseed(1234, 5678, out state);
     67      var ys_noise = ys.Select(yi => yi + alglib.hqrndnormal(state) * 0.01).ToList();
     68
     69      var tol = ys_noise.StandardDeviation();
     70
     71      var ds = new Dataset(new string[] { "x", "y" }, new[] { xs, ys_noise });
     72      var rows = Enumerable.Range(0, xs.Count);
     73
     74      var bestTol = double.PositiveInfinity;
     75      var bestLooRmse = double.PositiveInfinity;
     76      var clock = new Stopwatch();
     77      clock.Start();
     78      var iters = 0;
     79      while (tol > 0.0001) {
     80        double loo_rmse;
     81        double avgTrainRmse;
     82        var model = Splines.CalculateSmoothingSplineReinsch(
     83          xs.ToArray(), ys_noise.ToArray(), tol, "y", new string[] { "x" }, out avgTrainRmse, out loo_rmse);
     84        var y_pred = model.GetEstimatedValues(ds, rows);
     85
     86        Console.WriteLine("{0} {1} {2}", tol, avgTrainRmse, loo_rmse);
     87        if (loo_rmse < bestLooRmse) {
     88          bestLooRmse = loo_rmse;
     89          bestTol = tol;
     90        }
     91        tol *= 0.8;
     92        iters++;
     93      }
     94      clock.Stop();
     95      Console.WriteLine("Best tolerance {0}, RMSE (LOO): {1}, ms/run {2}", bestTol, bestLooRmse, clock.ElapsedMilliseconds / (double)iters);
     96    }
     97
     98    [TestMethod]
     99    public void TestReinschSplineAutomaticTolerance() {
     100      var xs = HeuristicLab.Common.SequenceGenerator.GenerateSteps(-3.5, 3.5, 0.02, includeEnd: true).ToList();
     101      var ys = xs.Select(xi => alglib.normaldistr.normaldistribution(xi)); // 1.0 / (Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * xi * xi))).ToArray();
     102
     103      alglib.hqrndstate state;
     104      alglib.hqrndseed(1234, 5678, out state);
     105      var ys_noise = ys.Select(yi => yi + alglib.hqrndnormal(state) * 0.01).ToList();
     106
     107      double optTol;
     108      double looRMSE;
     109      Splines.CalculateSmoothingSplineReinsch(xs.ToArray(), ys_noise.ToArray(), new string[] { "x" }, "y", out optTol, out looRMSE);
     110
     111      Console.WriteLine("Best tolerance {0}, RMSE (LOO): {1}", optTol, looRMSE);
     112
     113    }
     114  }
    24115}
Note: See TracChangeset for help on using the changeset viewer.