source: branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs @ 15465

Last change on this file since 15465 was 15465, checked in by gkronber, 4 years ago

Started a summary of findings.

File size: 40.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Diagnostics;
25using System.Linq;
26using System.Threading;
27using HeuristicLab.Common;
28using HeuristicLab.Core;
29using HeuristicLab.Data;
30using HeuristicLab.Optimization;
31using HeuristicLab.Parameters;
32using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
33using HeuristicLab.Problems.DataAnalysis;
34using HeuristicLab.Random;
35using MathNet.Numerics.Interpolation;
36using MathNet.Numerics.LinearAlgebra.Double;
37using MathNet.Numerics.LinearAlgebra.Double.Solvers;
38
39
40/*
41 * Experimenting with various spline implementations.
42 *
43 * There are many different variants of using splines.
44 *  - Spline Interpolation
45 *  - Natural Spline Interpolation
46 *  - Smoothing Splines (Reinsch)
47 *  - P-Splines
48 *  - B-Splines
49 *  - Regression Splines
50 *  - Penalized Regression Splines
51 *  -
52 *
53 *
54 *  Generally, a spline is a piecewise function whereby each piece connects two knots.
55 *  In almost all cases splines are univariate. There are extensions to more dimensions
56 *  but this is much less convenient.
57 *  A linear spline is a piecewise linear function. Usually the following constaints are enforced
58 *  for each knot point k and the two piecewise functions fl and fr
59 *  fl(k) = fr(k), fl(k)' = fr(k)', fl(k)''=0, fr(k)''=0, additionally constraints for
60 *  the boundary knots are often enforced.
61
62 *  Spline Interpolation solves the problem of finding a smooth interpolating function
63 *  which goes through all knots exactly.
64 *
65 *  In spline smoothing one tries to find a function minimizing the squared residuals
66 *  and maximizing smoothness. For the smoothing spline g the optimization objective is
67 *  || y - g(x) ||² + \lambda integrate (xmin : xmax) g(x)'' dx.
68 *  The roughness penality is the integral over the second derivative of the smoothing spline
69 *  over the whole domain of x. Lambda is used to balance the amount of smoothing.
70 *  If lambda is set to zero, the solution is an interpolating spline. If lambda is set to
71 *  infinity the solution is a maximally smooth function (= line).
72 *  Interestingly, the estimation of a smoothing spline is a linear operation. As a consequence
73 *  a lot of theory from linear models can be reused (e.g. cross-validation and generalized
74 *  cross-validation) with minimal additional effort. GCV can also be used to tune the
75 *  smoothing parameter automatically. Tuned algorithms are available to solve smoothing
76 *  spline estimation efficiently O(n) or O(m²n) where m is the number of knots and n the
77 *  number of data points.
78 *  It is possible to calculate confidence intervals for the smoothing spline and given
79 *  an error model subsequently also confidence intervals for predictions.
80 *
81 *  We are primarily interested in spline smoothing / regression.
82 *  Alglib and Math.NET provide several different implementations for interpolation
83 *  including polynomial (cubic), Hermite, Akima, Linear, Catmull-Rom, Rational, ...
84 *
85 *  For spline smoothing or regression there are also many differen variants.
86 *
87 *  Smoothing Splines by Reinsch (Reinsch, Smoothing by Spline Functions, 1967).
88 *  Pseudo-code for the algorithm is given in the paper. A small change in the algorithm
89 *  to improve convergence is discussed in (Reinsch, Smoothing by Spline Functions II, 1970)
90 *  The algorithm uses the properties of the smoothing matrix (bandedness) to calculate the
91 *  smoothing spline in O(n). Two parameters have to be set (S and rho). If the noise in the
92 *  data is known then both parameter can be set directly, otherwise S should be set to n-1
93 *  and rho should be found via cross-validation.
94 *  It is not easy to calculate a the CV (PRESS) or GCV in the approach by Reinsch and
95 *  therefore the algorithm is not ideal.
96 *  Wahba and colleagues (1990) introduced CV and GCV estimation.
97 *  Hutchinson and colleages improved the algorithm (Fortran implementation available as
98 *  algorithm 642.f from ACM).
99 *
100 *  CUBGCV (Algorithm 642 Hutchingson)
101 *  Several improvments over the Reinsch algorithm. In particular efficient calculation
102 *  of the
103 
104 *  Finbarr O'Sullivan BART
105 *  ...
106 *
107 *  B-Splines
108 *  ...
109 *
110 *  P-Splines (Eilers and Marx), use the B-Spline basis in combination with a penality
111 *  for large variations of subsequent B-Spline coefficients. Conceptually, if the
112 *  coefficients of neighbouring B-Splines are almost the same then the resulting smoothing
113 *  spline is less wiggly. It has been critizized that this is rather subjective.
114 *  P-Splines also allow calculation of CV and GCV for a given smoothing parameter which
115 *  can be used for optimization of lambda.
116 *  It is easy to use P-Splines within GAMs (no backfitting necessary).
117 *  The paper contains MATLAB code also for the calculation of the B-Spline basis.
118 *  The computational effort is O(?). The authors argue that it is efficient.
119 *  The number of knots is not critical it is possible to use every data point as knot.
120 *
121 *  Alglib: penalized regression spline
122 *  ...
123 *
124 *  Alglib: penalized regression spline with support for weights
125 *  ...
126 */
127
128namespace HeuristicLab.Algorithms.DataAnalysis.Experimental {
129  [Item("Splines", "")]
130  [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 102)]
131  [StorableClass]
132  public sealed class Splines : FixedDataAnalysisAlgorithm<IRegressionProblem> {
133    [StorableConstructor]
134    private Splines(bool deserializing) : base(deserializing) { }
135    [StorableHook(HookType.AfterDeserialization)]
136    private void AfterDeserialization() {
137    }
138
139    private Splines(Splines original, Cloner cloner)
140      : base(original, cloner) {
141    }
142    public override IDeepCloneable Clone(Cloner cloner) {
143      return new Splines(this, cloner);
144    }
145
146    public Splines()
147      : base() {
148      var validTypes = new ItemSet<StringValue>(
149        new[] {
150        "Monotone", "Akima", "Catmull-Rom", "Cubic", "Linear",
151          "Cubic - Natural (Math.NET)",
152          "Polynomial (Math.NET)",
153          "Rational (Math.NET)",
154          "LogLinear (Math.NET)",
155          "Common (Math.NET)",
156          "Smoothing Spline (Mine)",
157          "Smoothing Spline (Reinsch)",
158          "Smoothing Spline (Reinsch with automatic tolerance determination using LOOCV)",
159          "B-Spline Smoothing",
160          "Penalized Regression Spline (alglib)",
161          "CUBGCV",
162          "Finnbar O'Sullivan (sbart)"
163      }.Select(s => new StringValue(s)));
164
165      Parameters.Add(new ConstrainedValueParameter<StringValue>("Type", "The type of spline (as supported by alglib)", validTypes, validTypes.Last()));
166      Parameters.Add(new ValueParameter<DoubleValue>("Lambda", "Regularization parameter for smoothing splines (0..+inf)", new DoubleValue(100)));
167      Parameters.Add(new ValueParameter<DoubleValue>("StdDev (noise)", "Known error in y values. Used only be Reinsch Smoothing Splines", new DoubleValue(0.1)));
168      Problem = new RegressionProblem();
169    }
170
171
172    protected override void Run(CancellationToken cancellationToken) {
173      double[,] inputMatrix = Problem.ProblemData.Dataset.ToArray(Problem.ProblemData.AllowedInputVariables.Concat(new string[] { Problem.ProblemData.TargetVariable }), Problem.ProblemData.TrainingIndices);
174      if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
175        throw new NotSupportedException("Splines does not support NaN or infinity values in the input dataset.");
176
177      var inputVars = Problem.ProblemData.AllowedInputVariables.ToArray();
178      if (inputVars.Length > 3) throw new ArgumentException();
179
180      var y = Problem.ProblemData.TargetVariableTrainingValues.ToArray();
181      if (inputVars.Length == 1) {
182
183        var x = Problem.ProblemData.Dataset.GetDoubleValues(inputVars.First(), Problem.ProblemData.TrainingIndices).ToArray();
184        alglib.spline1dinterpolant c;
185        var type = ((StringValue)Parameters["Type"].ActualValue).Value;
186        switch (type) {
187          case "Monotone":
188            alglib.spline1dbuildmonotone(x, y, out c);
189            AddAlglibSplineResult(c, inputVars);
190            break;
191          case "Akima":
192            alglib.spline1dbuildakima(x, y, out c); AddAlglibSplineResult(c, inputVars);
193            break;
194            ;
195          case "Catmull-Rom":
196            alglib.spline1dbuildcatmullrom(x, y, out c); AddAlglibSplineResult(c, inputVars);
197            break;
198
199          case "Cubic":
200            alglib.spline1dbuildcubic(x, y, out c); AddAlglibSplineResult(c, inputVars);
201            break;
202
203          case "Linear":
204            alglib.spline1dbuildlinear(x, y, out c); AddAlglibSplineResult(c, inputVars);
205            break;
206          case "Cubic - Natural (Math.NET)": {
207              var spline = MathNet.Numerics.Interpolation.CubicSpline.InterpolateNatural(x, y);
208              AddMathNetSplineResult(spline, inputVars);
209              break;
210            }
211          case "Common (Math.NET)": {
212              var spline = MathNet.Numerics.Interpolate.Common(x, y);
213              AddMathNetSplineResult(spline, inputVars);
214              break;
215            }
216          case "LogLinear (Math.NET)": {
217              var spline = MathNet.Numerics.Interpolate.LogLinear(x, y);
218              AddMathNetSplineResult(spline, inputVars);
219              break;
220            }
221          case "Polynomial (Math.NET)": {
222              var spline = MathNet.Numerics.Interpolate.Polynomial(x, y);
223              AddMathNetSplineResult(spline, inputVars);
224              break;
225            }
226          case "Rational (Math.NET)": {
227              var spline = MathNet.Numerics.Interpolate.RationalWithoutPoles(x, y);
228              AddMathNetSplineResult(spline, inputVars);
229              break;
230            }
231          case "Smoothing Spline (Mine)": {
232              CalculateSmoothingSpline(x, y, inputVars);
233              break;
234            }
235          case "Smoothing Spline (Reinsch)": {
236              CalculateSmoothingSplineReinsch(x, y, inputVars);
237              break;
238            }
239          case "Smoothing Spline (Reinsch with automatic tolerance determination using LOOCV)": {
240              double optTol, looRMSE;
241              var model = CalculateSmoothingSplineReinsch(x, y, inputVars, Problem.ProblemData.TargetVariable, out optTol, out looRMSE);
242              Results.Add(new Result("Solution", model.CreateRegressionSolution((IRegressionProblemData)Problem.ProblemData.Clone())));
243              Results.Add(new Result("Optimal tolerance", new DoubleValue(optTol)));
244              Results.Add(new Result("RMSE (LOO-CV)", new DoubleValue(looRMSE)));
245              break;
246            }
247          case "CUBGCV": {
248              CubicSplineGCV.CubGcvReport report;
249              var model =
250                CubicSplineGCV.CalculateCubicSpline(x, y,
251                Problem.ProblemData.TargetVariable, inputVars, out report);
252              var targetVar = Problem.ProblemData.TargetVariable;
253              var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
254              Results.Add(new Result("Solution", model.CreateRegressionSolution(problemData)));
255              Results.Add(new Result("GCV", new DoubleValue(report.generalizedCrossValidation)));
256              Results.Add(new Result("Estimated error variance", new DoubleValue(report.estimatedErrorVariance)));
257              Results.Add(new Result("Estimated RSS degrees of freedom", new DoubleValue(report.estimatedRSSDegreesOfFreedom)));
258              Results.Add(new Result("Estimated true mean squared error at data points", new DoubleValue(report.estimatedTrueMeanSquaredErrorAtDataPoints)));
259              Results.Add(new Result("Mean square of DF", new DoubleValue(report.meanSquareOfDf)));
260              Results.Add(new Result("Mean square residual", new DoubleValue(report.meanSquareResiudal)));
261              Results.Add(new Result("Smoothing parameter (optimized for GCV)", new DoubleValue(report.smoothingParameter)));
262              break;
263            }
264          case "Finnbar O'Sullivan (sbart)": {
265              SBART.SBART_Report report;
266              var lambda = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value; // controls extent of smoothing
267              var model =
268                SBART.CalculateSBART(x, y,
269                Problem.ProblemData.TargetVariable, inputVars, (float)lambda, out report);
270              var targetVar = Problem.ProblemData.TargetVariable;
271              var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
272              Results.Add(new Result("Solution", model.CreateRegressionSolution(problemData)));
273              Results.Add(new Result("GCV", new DoubleValue(report.gcv)));
274              Results.Add(new Result("Smoothing parameter (optimized for GCV)", new DoubleValue(report.smoothingParameter)));
275              break;
276
277            }
278          case "B-Spline Smoothing": {
279              BSplineSmoothing(x, y, inputVars);
280              break;
281            }
282          case "Penalized Regression Spline (alglib)": {
283              var lambda = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value; // controls extent of smoothing
284              var model = CalculatePenalizedRegressionSpline(x, y, lambda, Problem.ProblemData.TargetVariable, inputVars);
285              var targetVar = Problem.ProblemData.TargetVariable;
286              var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
287              Results.Add(new Result("Solution", model.CreateRegressionSolution(problemData)));
288
289              break;
290            }
291          default: throw new NotSupportedException();
292        }
293
294      }
295    }
296
297
298    public static IRegressionModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars) {
299      int info;
300      alglib.spline1dinterpolant interpolant;
301      alglib.spline1dfitreport rep;
302      int n = x.Length;
303      n = (int)Math.Max(50, 3 * Math.Sqrt(n));
304
305      alglib.spline1dfitpenalized(x, y, n, lambda, out info, out interpolant, out rep);
306      return new Spline1dModel(interpolant, targetVar, inputVars);
307    }
308
309    public static IRegressionEnsembleModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars,
310      out double avgTrainRMSE,
311      out double cvRMSE, out double[] predictions) {
312      int info;
313      alglib.spline1dinterpolant interpolant;
314      alglib.spline1dfitreport rep;
315      int n = x.Length;
316      n = (int)Math.Max(50, 3 * Math.Sqrt(n));
317
318      int folds = 10;
319      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
320      if (foldSize == 1) folds = x.Length;
321
322      double[] x_loo = new double[(folds - 1) * foldSize];
323      double[] y_loo = new double[(folds - 1) * foldSize];
324      double[] w_loo = Enumerable.Repeat(1.0, x_loo.Length).ToArray();
325
326      var models = new List<IRegressionModel>();
327      var sumTrainSE = 0.0;
328      int nTrain = 0;
329      var sumTestSE = 0.0;
330      int nTest = 0;
331
332      predictions = new double[y.Length];
333
334      for (int i = 0; i < folds; i++) {
335        if (i < folds - 1) {
336          //Console.WriteLine("Copy from {0} to {1} n = {2}", (i + 1) * foldSize, i * foldSize, (folds - i - 1) * foldSize);
337          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, (folds - i - 1) * foldSize);
338          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, (folds - i - 1) * foldSize);
339        }
340        alglib.spline1dfitpenalizedw(x_loo, y_loo, w_loo, n, lambda, out info, out interpolant, out rep);
341        //Debug.Assert(y_loo.All(yi => yi != 0.0));
342        //Debug.Assert(x_loo.All(xi => xi != 0.0));
343
344        // training error
345        for (int j = 0; j < x_loo.Length; j++) {
346          var y_pred = alglib.spline1dcalc(interpolant, x[j]);
347          double res = y[j] - y_pred;
348          sumTrainSE += res * res;
349          nTrain++;
350        }
351        // test
352        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
353          predictions[j] = alglib.spline1dcalc(interpolant, x[j]);
354          double res = y[j] - predictions[j];
355          sumTestSE += res * res;
356          nTest++;
357        }
358
359        models.Add(new Spline1dModel(interpolant, targetVar, inputVars));
360
361        if (i < folds - 1) {
362          //Console.WriteLine("Copy from {0} to {1} n = {2}", i * foldSize, i * foldSize, foldSize);
363          // add current fold for next iteration
364          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
365          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
366        }
367      }
368
369      cvRMSE = Math.Sqrt(sumTestSE / nTest);
370      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
371
372      return new RegressionEnsembleModel(models);
373    }
374
375    public void BSplineSmoothing(double[] x, double[] y, string[] inputVars) {
376      Array.Sort(x, y);
377      CubicSpline2(x, y, inputVars);
378    }
379
380    private struct SplineData {
381      public double a, b, c, d, x, y;
382    };
383
384
385    /*
386     * The procedure Quincunx, takes as arguments
387       the vectors u, v and w which are respectively the diagonal, the first supradiagonal
388       and the second supradiagonal of the banded matrix. The vector on
389       the LHS of the equation (80) is placed in q which contains the solution on the
390       completion of the procedure.
391    */
392    private void Quincunx(int n, MyArray<double> u, MyArray<double> v, MyArray<double> w, MyArray<double> q) {
393      // { factorisation}
394      u[-1] = 0;
395      u[0] = 0;
396      for (int j = 1; j <= n - 1; j++) {
397        u[j] = u[j] - u[j - 2] * Math.Pow(w[j - 2], 2) - u[j - 1] * Math.Pow(v[j - 1], 2);
398        v[j] = (v[j] - u[j - 1] * v[j - 1] * w[j - 1]) / u[j];
399        w[j] = w[j] / u[j];
400      }
401      // { forward substitution}
402      for (int j = 1; j <= n - 1; j++) {
403        q[j] = q[j] - v[j - 1] * q[j - 1] - w[j - 2] * q[j - 2];
404      }
405      for (int j = 1; j < n - 1; j++) {
406        q[j] = q[j] / u[j];
407      }
408      // { back substitution}
409      q[n + 1] = 0;
410      q[n] = 0;
411      for (int j = n - 1; j >= 1; j--) {
412        q[j] = q[j] - v[j] * q[j + 1] - w[j] * q[j + 2];
413      }
414    }
415
416    public void CubicSpline2(double[] x, double[] y, string[] inputVars) {
417      // see Pollock: Smoothing Splines
418      int n = x.Length;
419      double lambda = 1.0;
420      double[] sigma = Enumerable.Repeat(1.0, n).ToArray();
421      SplineData[] S = new SplineData[x.Length + 1];
422      for (int i = 0; i < x.Length; i++) {
423        S[i].x = x[i];
424        S[i].y = y[i];
425      }
426      S[x.Length].x = S[x.Length - 1].x;
427      S[x.Length].y = S[x.Length - 1].y;
428
429      // var
430      double[] h = new double[n],
431        r = new double[n],
432        f = new double[n],
433        p = new double[n];
434      var q = new MyArray<double>(-1, n + 3);
435      var u = new MyArray<double>(-1, n + 3);
436      var v = new MyArray<double>(-1, n + 3);
437      var w = new MyArray<double>(-1, n + 3);
438      double mu;
439
440      mu = 2 * (1 - lambda) / (3 * lambda);
441      h[0] = S[1].x - S[0].x;
442      r[0] = 3 / h[0];
443      for (int i = 1; i < n - 1; i++) {
444        h[i] = S[i + 1].x - S[i].x;
445        r[i] = 3 / h[i];
446        f[i] = -(r[i - 1] + r[i]);
447        p[i] = 2 * (S[i + 1].x - S[i - 1].x);
448        q[i] = 3 * (S[i + 1].y - S[i].y) / h[i] - 3 * (S[i].y - S[i - 1].y) / h[i - 1];
449      }
450      for (int i = 1; i < n; i++) {
451        u[i] = Math.Pow(r[i - 1], 2) * sigma[i - 1] + Math.Pow(f[i], 2) * sigma[i] + Math.Pow(r[i], 2) * sigma[i + 1];    // TODO Sigma 1..n
452        u[i] = mu * u[i] + p[i];
453        v[i] = f[i] * r[i] * sigma[i] + r[i] * f[i + 1] * sigma[i + 1];
454        v[i] = mu * v[i] + h[i];
455        w[i] = mu * r[i] * r[i + 1] * sigma[i + 1];
456      }
457      Quincunx(n, u, v, w, q);
458      // { Spline P arameters}
459      S[0].d = S[0].y - mu * r[0] * q[1] * sigma[0];
460      S[1].d = S[1].y - mu * (f[1] * q[1] + r[1] * q[2]) * sigma[0];
461      S[0].a = q[1] / (3 * h[0]);
462      S[0].b = 0;
463      S[0].c = (S[1].d - S[0].d) / h[0] - q[1] * h[0] / 3;
464      r[0] = 0;
465      for (int j = 1; j < n; j++) {
466        S[j].a = (q[j + 1] - q[j]) / (3 * h[j]);
467        S[j].b = q[j];
468        S[j].c = (q[j] + q[j - 1]) * h[j - 1] + S[j - 1].c;
469        S[j].d = r[j - 1] * q[j - 1] + f[j] * q[j] + r[j] * q[j + 1];
470        S[j].d = y[j] - mu * S[j].d * sigma[j];
471      }
472
473      //   { SmoothingSpline}
474    }
475
476    public void CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars) {
477      double s = x.Length + 1;
478      double stdTol = ((IValueParameter<DoubleValue>)Parameters["StdDev (noise)"]).Value.Value; // controls extent of smoothing
479
480      var model = CalculateSmoothingSplineReinsch(x, y, inputVars, stdTol, s, Problem.ProblemData.TargetVariable);
481
482      Results.Add(new Result("Solution", model.CreateRegressionSolution((IRegressionProblemData)Problem.ProblemData.Clone())));
483    }
484
485
486    public static IRegressionEnsembleModel CalculateSmoothingSplineReinsch(double[] x, double[] y, double tol, string targetVar, string[] inputVars,
487      out double avgTrainRMSE,
488      out double cvRMSE) {
489      int info;
490
491      int folds = 10;
492      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
493      if (foldSize == 1) folds = x.Length;
494
495      double[] x_loo = new double[(folds - 1) * foldSize];
496      double[] y_loo = new double[(folds - 1) * foldSize];
497
498      var models = new List<IRegressionModel>();
499      var sumTrainSE = 0.0;
500      int nTrain = 0;
501      var sumTestSE = 0.0;
502      int nTest = 0;
503
504
505      for (int i = 0; i < folds; i++) {
506        if (i < folds - 1) {
507          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, x.Length - ((i + 1) * foldSize) - 1);
508          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, y.Length - ((i + 1) * foldSize) - 1);
509        }
510        var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, ((folds - 1) * foldSize) - 1, targetVar);
511
512        // training error
513        for(int j = 0;j<x_loo.Length;j++) {
514          var y_pred = model.GetEstimatedValue(x[j]);
515          double res = y[j] - y_pred;
516          sumTrainSE += res * res;
517          nTrain++;
518        }
519        // test
520        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
521          var y_pred = model.GetEstimatedValue(x[j]);
522          double res = y[j] - y_pred;
523          sumTestSE += res * res;
524          nTest++;
525        }
526
527        models.Add(model);
528
529        if (i < folds - 1) {
530          // add current fold for next iteration
531          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
532          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
533        }
534      }
535
536      cvRMSE = Math.Sqrt(sumTestSE / nTest);
537      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
538
539      return new RegressionEnsembleModel(models);
540    }
541
542
543    // automatically determines tolerance using loo cv, SLOW!
544    public static IRegressionModel CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars, string targetVar,
545      out double optTolerance, out double cvRMSE) {
546      var maxTolerance = y.StandardDeviation();
547      var minTolerance = maxTolerance * 1E-6;
548      optTolerance = maxTolerance;
549      var tol = maxTolerance;
550      cvRMSE = double.PositiveInfinity;
551      IRegressionModel bestModel = new ConstantModel(y.Average(), targetVar);
552      while (tol > minTolerance) {
553        double curCVRMSE;
554        double avgTrainRmse;
555        var model = Splines.CalculateSmoothingSplineReinsch(
556          x, y, tol, targetVar, inputVars, out avgTrainRmse, out curCVRMSE);
557
558        if (curCVRMSE < cvRMSE) {
559          cvRMSE = curCVRMSE;
560          optTolerance = tol;
561          bestModel = model;
562        }
563        tol *= 0.85;
564      }
565      return bestModel;
566    }
567
568    public static ReinschSmoothingSplineModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double stdTol, double s, string targetVar) {
569      var minX = xOrig.Min();
570      var maxX = xOrig.Max();
571      var range = maxX - minX;
572
573      double[] w = Enumerable.Repeat(stdTol, xOrig.Length).ToArray();
574      SortAndBin((double[])xOrig.Clone(), (double[])yOrig.Clone(), w, out xOrig, out yOrig, out w, scaling: false);
575
576      // See Smoothing by Spline Functions, Reinsch, 1967
577      // move x and y into an array indexed with 1..n to match indexing in Reinsch paper
578      int n = xOrig.Length;
579      var x = new MyArray<double>(1, xOrig);
580      var y = new MyArray<double>(1, yOrig);
581      var inv_dy = new MyArray<double>(1, w);
582
583      int n1 = 1;
584      int n2 = n;
585
586      // results
587      var a = new MyArray<double>(n1, n);
588      var b = new MyArray<double>(n1, n);
589      var c = new MyArray<double>(n1, n);
590      var d = new MyArray<double>(n1, n);
591
592      // smooth
593      {
594        int i, m1, m2; double e, f, f2, g = 0.0, h, p;
595        MyArray<double>
596          r = new MyArray<double>(n1 - 1, n + 2),
597          r1 = new MyArray<double>(n1 - 1, n + 2),
598          r2 = new MyArray<double>(n1 - 1, n + 2),
599          t = new MyArray<double>(n1 - 1, n + 2),
600          t1 = new MyArray<double>(n1 - 1, n + 2),
601          u = new MyArray<double>(n1 - 1, n + 2),
602          v = new MyArray<double>(n1 - 1, n + 2);
603        m1 = n1 - 1;
604        m2 = n2 + 1;
605        r[m1] = r[n1] = r1[n2] = r2[n2] = r2[m2] =
606          u[m1] = u[n1] = u[n2] = u[m2] = p = 0.0;
607        m1 = n1 + 1;
608        m2 = n2 - 1;
609        h = x[m1] - x[n1]; f = (y[m1] - y[n1]) / h;
610        for (i = m1; i <= m2; i++) {
611          g = h; h = x[i + 1] - x[i];
612          e = f; f = (y[i + 1] - y[i]) / h;
613          a[i] = f - e; t[i] = 2 * (g + h) / 3; t1[i] = h / 3;
614          r2[i] = inv_dy[i - 1] / g; r[i] = inv_dy[i + 1] / h;
615          r1[i] = -inv_dy[i] / g - inv_dy[i] / h;
616        }
617        for (i = m1; i <= m2; i++) {
618          b[i] = r[i] * r[i] + r1[i] * r1[i] + r2[i] * r2[i];
619          c[i] = r[i] * r1[i + 1] + r1[i] * r2[i + 1];
620          d[i] = r[i] * r2[i + 2];
621        }
622        f2 = -s;
623        next:
624        for (i = m1; i <= m2; i++) {
625          r1[i - 1] = f * r[i - 1]; r2[i - 2] = g * r[i - 2];
626          r[i] = 1 / (p * b[i] + t[i] - f * r1[i - 1] - g * r2[i - 2]);
627          u[i] = a[i] - r1[i - 1] * u[i - 1] - r2[i - 2] * u[i - 2];
628          f = p * c[i] + t1[i] - h * r1[i - 1]; g = h; h = d[i] * p;
629        }
630        for (i = m2; i >= m1; i--) {
631          u[i] = r[i] * u[i] - r1[i] * u[i + 1] - r2[i] * u[i + 2];
632        }
633        e = h = 0;
634        for (i = n1; i <= m2; i++) {
635          g = h; h = (u[i + 1] - u[i]) / (x[i + 1] - x[i]);
636          v[i] = (h - g) * inv_dy[i] * inv_dy[i]; e = e + v[i] * (h - g);
637        }
638        g = v[n2] = -h * inv_dy[n2] * inv_dy[n2]; e = e - g * h;
639        g = f2; f2 = e * p * p;
640        if (f2 >= s || f2 <= g) goto fin;
641        f = 0; h = (v[m1] - v[n1]) / (x[m1] - x[n1]);
642        for (i = m1; i <= m2; i++) {
643          g = h; h = (v[i + 1] - v[i]) / (x[i + 1] - x[i]);
644          g = h - g - r1[i - 1] * r[i - 1] - r2[i - 2] * r[i - 2];
645          f = f + g * r[i] * g; r[i] = g;
646        }
647        h = e - p * f; if (h <= 0) goto fin;
648        p = p + (s - f2) / ((Math.Sqrt(s / e) + p) * h); goto next;
649
650        fin:
651        for (i = n1; i <= n2; i++) {
652          a[i] = y[i] - p * v[i];
653          c[i] = u[i];
654        }
655        for (i = n1; i <= m2; i++) {
656          h = x[i + 1] - x[i];
657          d[i] = (c[i + 1] - c[i]) / (3 * h);
658          b[i] = (a[i + 1] - a[i]) / h - (h * d[i] + c[i]) * h;
659        }
660      }
661
662      // extrapolate for xx > x[n2]
663      b[b.Length] = b[b.Length - 1];
664      d[1] = 0;
665
666      return new ReinschSmoothingSplineModel(a, b, c, d, x, targetVar, inputVars);
667    }
668
669    private void CalculateSmoothingSpline(double[] x, double[] y, string[] inputVars) {
670      // see Smoothing and Non-Parametric Regression, Germán Rodríguez, 2001 2.3.1
671      double[] w = Enumerable.Repeat(1.0, x.Length).ToArray();  // weights necessary for sortAndBin but are ignored below (TODO)
672      SortAndBin(x, y, w, out x, out y, out w, scaling: false);
673      int n = x.Length;
674
675      SparseMatrix delta = new SparseMatrix(n - 2, n);
676      // double[,] delta = new double[n - 2, n];
677      //double[,] W = new double[n - 2, n - 2];
678      SparseMatrix W = new SparseMatrix(n - 2, n - 2);
679      Matrix WInvD = new DenseMatrix(n - 2, n);
680
681      // double[,] W_inv_D = new double[n - 2, n];
682      // double[,] K = new double[n, n];
683
684      // go over successive knots to determine distances and fill Delta and W
685      for (int i = 0; i < n - 2; i++) {
686        double h = x[i + 1] - x[i];
687        double h_next = x[i + 2] - x[i + 1];
688        delta[i, i] = 1.0 / h;
689        delta[i, i + 1] = -1.0 / h - 1.0 / h_next;
690        delta[i, i + 2] = 1.0 / h_next;
691        W[i, i] = (h + h_next) / 3;
692        if (i > 0) {
693          W[i - 1, i] =
694           W[i, i - 1] = h / 6.0;
695        }
696      }
697
698      // WInvD = W.Cholesky().Solve(delta);
699      var solvResult = W.TrySolveIterative(delta, WInvD, new MlkBiCgStab());
700
701      // alglib.ablas.rmatrixgemm(n - 2, n, n - 2, 1.0, W, 0, 0, 0, delta, 0, 0, 0, 1.0, W_inv_D, 0, 0); // W^-1(M = n-2, K = n-2) D(K = n-2, N=n)   
702      // alglib.ablas.rmatrixgemm(n, n, n - 2, 1.0, delta, 0, 0, 1, W_inv_D, 0, 0, 0, 1.0, K, 0, 0); // D(M=n-2, K=n)^T * W^-1D (K=n, N=n-2)
703
704      var K = delta.TransposeThisAndMultiply(WInvD);
705
706      double lambda = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value;
707
708      for (int i = 0; i < n; i++) {
709        for (int j = 0; j < n; j++) {
710          K[i, j] *= lambda;
711          if (i == j) K[i, j] += 1;
712        }
713      }
714
715      // solve for y
716      // double[] s;
717      // int solverInfo;
718      // alglib.densesolverreport solverRep;
719      // alglib.rmatrixsolve(K, n, y, out solverInfo, out solverRep, out s);
720
721      var s = K.Solve(new DenseVector(y)).ToArray();
722
723      Results.Add(new Result("Solution", new RegressionSolution(new SmoothingSplineModel(s, x, Problem.ProblemData.TargetVariable, inputVars),
724  (IRegressionProblemData)Problem.ProblemData.Clone())));
725    }
726
727    public static void SortAndBin(double[] x, double[] y, double[] w, out double[] x2, out double[] y2, out double[] w2, bool scaling = false) {
728      var sortedIdx = Enumerable.Range(0, x.Length).ToArray();
729      // sort by x
730      Array.Sort(x, sortedIdx);
731
732      var xl = new List<double>();
733      var yl = new List<double>();
734      var wl = new List<double>();
735
736      int n = x.Length;
737      var range = x[n - 1] - x[0];
738      var binSize = range / n / 10;
739      {
740        // binning
741        int i = 0;
742        while (i < n) {
743          int k = 0;
744          int j = i;
745          double sumX = 0.0;
746          double sumY = 0.0;
747          double sumW = 0.0;
748          while (j < n && x[j] - x[i] <= binSize) {
749            k++;
750            sumX += x[j];
751            sumY += y[sortedIdx[j]];
752            sumW += w[sortedIdx[j]];
753            j++;
754          }
755          var avgX = sumX / k;
756          if (scaling) avgX = (avgX - x[0]) / range;
757          xl.Add(avgX);
758          yl.Add(sumY / k);
759          wl.Add(sumW);
760          i += k;
761        }
762      }
763
764      x2 = xl.ToArray();
765      y2 = yl.ToArray();
766      w2 = wl.ToArray();
767    }
768
769    private void AddAlglibSplineResult(alglib.spline1dinterpolant c, string[] inputVars) {
770      Results.Add(new Result("Solution", new RegressionSolution(new Spline1dModel(c, Problem.ProblemData.TargetVariable, inputVars),
771        (IRegressionProblemData)Problem.ProblemData.Clone())));
772
773    }
774    private void AddMathNetSplineResult(IInterpolation c, string[] inputVars) {
775      Results.Add(new Result("Solution", new RegressionSolution(new MathNetSplineModel(c, Problem.ProblemData.TargetVariable, inputVars),
776        (IRegressionProblemData)Problem.ProblemData.Clone())));
777    }
778  }
779
780
781  // array with non-zero lower index
782  internal class MyArray<T> {
783    private T[] arr;
784    private int lowerBound;
785
786    public int Length { get { return arr.Length; } }
787
788    public T this[int key] {
789      get {
790        return arr[key - lowerBound];
791      }
792      set {
793        arr[key - lowerBound] = value;
794      }
795    }
796
797    public MyArray(int lowerBound, int numElements) {
798      this.lowerBound = lowerBound;
799      arr = new T[numElements];
800    }
801    public MyArray(int lowerBound, T[] source) : this(lowerBound, source.Length) {
802      Array.Copy(source, arr, source.Length);
803    }
804
805    public T[] ToArray() {
806      var res = new T[arr.Length];
807      Array.Copy(arr, res, res.Length);
808      return res;
809    }
810  }
811
812
813  // UNFINISHED
814  public class ReinschSmoothingSplineModel : NamedItem, IRegressionModel {
815    private MyArray<double> a;
816    private MyArray<double> b;
817    private MyArray<double> c;
818    private MyArray<double> d;
819    private MyArray<double> x;
820    private double offset;
821    private double scale;
822
823    public string TargetVariable { get; set; }
824
825    public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
826
827    public event EventHandler TargetVariableChanged;
828
829    public ReinschSmoothingSplineModel(ReinschSmoothingSplineModel orig, Cloner cloner) : base(orig, cloner) {
830      this.TargetVariable = orig.TargetVariable;
831      this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
832      this.a = orig.a;
833      this.b = orig.b;
834      this.c = orig.c;
835      this.d = orig.d;
836      this.x = orig.x;
837      this.scale = orig.scale;
838      this.offset = orig.offset;
839    }
840    internal ReinschSmoothingSplineModel(
841      MyArray<double> a,
842      MyArray<double> b,
843      MyArray<double> c,
844      MyArray<double> d,
845      MyArray<double> x,
846      string targetVar, string[] inputs, double offset = 0, double scale = 1) : base("SplineModel", "SplineModel") {
847      this.a = a;
848      this.b = b;
849      this.c = c;
850      this.d = d;
851      this.x = x;
852      this.TargetVariable = targetVar;
853      this.VariablesUsedForPrediction = inputs;
854      this.scale = scale;
855      this.offset = offset;
856    }
857
858    public override IDeepCloneable Clone(Cloner cloner) {
859      return new ReinschSmoothingSplineModel(this, cloner);
860    }
861
862    public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
863      return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
864    }
865
866    public double GetEstimatedValue(double xx) {
867      xx = (xx - offset) * scale;
868      int n = a.Length;
869      if (xx <= x[1]) {
870        double h = xx - x[1];
871        return a[1] + h * (b[1] + h * (c[1] + h * d[1]));
872      } else if (xx >= x[n]) {
873        double h = xx - x[n];
874        return a[n] + h * (b[n] + h * (c[n] + h * d[n]));
875      } else {
876        // binary search
877        int lower = 1;
878        int upper = n;
879        while (true) {
880          if (upper < lower) throw new InvalidProgramException();
881          int i = lower + (upper - lower) / 2;
882          if (x[i] <= xx && xx < x[i + 1]) {
883            double h = xx - x[i];
884            return a[i] + h * (b[i] + h * (c[i] + h * d[i]));
885          } else if (xx < x[i]) {
886            upper = i - 1;
887          } else {
888            lower = i + 1;
889          }
890        }
891      }
892    }
893
894    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
895      int n = x.Length;
896      var product = dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).ToArray();
897      foreach (var factor in VariablesUsedForPrediction.Skip(1)) {
898        product = product.Zip(dataset.GetDoubleValues(factor, rows), (pi, fi) => pi * fi).ToArray();
899      }
900      foreach (var xx in product) {
901        yield return GetEstimatedValue(xx);
902      }
903    }
904  }
905
906  // UNFINISHED
907  public class SmoothingSplineModel : NamedItem, IRegressionModel {
908    private double[] s;
909    private IInterpolation interpolation;
910
911    public string TargetVariable { get; set; }
912
913    public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
914
915    public event EventHandler TargetVariableChanged;
916
917    public SmoothingSplineModel(SmoothingSplineModel orig, Cloner cloner) : base(orig, cloner) {
918      this.TargetVariable = orig.TargetVariable;
919      this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
920      this.s = orig.s; // TODO
921      this.interpolation = orig.interpolation;
922    }
923    public SmoothingSplineModel(double[] s, double[] x, string targetVar, string[] inputs) : base("SplineModel", "SplineModel") {
924      this.s = s;
925      this.TargetVariable = targetVar;
926      this.VariablesUsedForPrediction = inputs;
927      this.interpolation = MathNet.Numerics.Interpolate.CubicSpline(x, s);
928    }
929
930    public override IDeepCloneable Clone(Cloner cloner) {
931      return new SmoothingSplineModel(this, cloner);
932    }
933
934    public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
935      return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
936    }
937
938    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
939      foreach (var x in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows)) {
940
941        yield return interpolation.Interpolate(x);
942
943      }
944    }
945  }
946
947  // UNFINISHED
948  public class Spline1dModel : NamedItem, IRegressionModel {
949    private alglib.spline1dinterpolant interpolant;
950
951    public string TargetVariable { get; set; }
952
953    public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
954
955    public event EventHandler TargetVariableChanged;
956
957    public Spline1dModel(Spline1dModel orig, Cloner cloner) : base(orig, cloner) {
958      this.TargetVariable = orig.TargetVariable;
959      this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
960      this.interpolant = (alglib.spline1dinterpolant)orig.interpolant.make_copy();
961    }
962    public Spline1dModel(alglib.spline1dinterpolant interpolant, string targetVar, string[] inputs) : base("SplineModel", "SplineModel") {
963      this.interpolant = interpolant;
964      this.TargetVariable = targetVar;
965      this.VariablesUsedForPrediction = inputs;
966    }
967
968    public override IDeepCloneable Clone(Cloner cloner) {
969      return new Spline1dModel(this, cloner);
970    }
971
972    public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
973      return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
974    }
975
976    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
977      var product = dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).ToArray();
978      foreach (var factor in VariablesUsedForPrediction.Skip(1)) {
979        product = product.Zip(dataset.GetDoubleValues(factor, rows), (pi, fi) => pi * fi).ToArray();
980      }
981
982      foreach (var x in product) {
983        yield return alglib.spline1dcalc(interpolant, x);
984      }
985    }
986  }
987
988
989  // UNFINISHED
990  public class MathNetSplineModel : NamedItem, IRegressionModel {
991    private IInterpolation interpolant;
992
993    public string TargetVariable { get; set; }
994
995    public IEnumerable<string> VariablesUsedForPrediction { get; private set; }
996
997    public event EventHandler TargetVariableChanged;
998
999    public MathNetSplineModel(MathNetSplineModel orig, Cloner cloner) : base(orig, cloner) {
1000      this.TargetVariable = orig.TargetVariable;
1001      this.VariablesUsedForPrediction = orig.VariablesUsedForPrediction.ToArray();
1002      this.interpolant = orig.interpolant; // TODO COPY!
1003    }
1004    public MathNetSplineModel(IInterpolation interpolant, string targetVar, string[] inputs) : base("SplineModel", "SplineModel") {
1005      this.interpolant = interpolant;
1006      this.TargetVariable = targetVar;
1007      this.VariablesUsedForPrediction = inputs;
1008    }
1009
1010    public override IDeepCloneable Clone(Cloner cloner) {
1011      return new MathNetSplineModel(this, cloner);
1012    }
1013
1014    public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
1015      return new RegressionSolution(this, (IRegressionProblemData)problemData.Clone());
1016    }
1017
1018    public IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) {
1019      foreach (var x in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows)) {
1020        yield return interpolant.Interpolate(x);
1021      }
1022    }
1023  }
1024}
Note: See TracBrowser for help on using the repository browser.