Changeset 15365


Ignore:
Timestamp:
09/17/17 18:06:27 (2 years ago)
Author:
gkronber
Message:

#2789 experiments with spline smoothing and additive models

Location:
branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs

    r15364 r15365  
    109109
    110110        rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors
    111         rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
     111        rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation());
     112
     113        if (cancellationToken.IsCancellationRequested) break;
    112114      }
    113115
     
    135137    private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) {
    136138      // Umständlich!
    137       var ds = new Dataset(new string[] { inputVar, "y" }, new[] { problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToList(), target.ToList() });
    138       var pd = new RegressionProblemData(ds, new string[] { inputVar }, "y");
     139      var ds = ((Dataset)problemData.Dataset).ToModifiable();
     140      ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>());
     141      var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
     142      pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
     143      pd.TrainingPartition.End = problemData.TrainingPartition.End;
     144      pd.TestPartition.Start = problemData.TestPartition.Start;
     145      pd.TestPartition.End = problemData.TestPartition.End;
    139146      double rmsError, cvRmsError;
    140147      return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model;
     
    142149
    143150    private IRegressionModel RegressSpline(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
    144       // Umständlich!
    145       return Splines.CalculateSmoothingSplineReinsch(
    146         problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(),
    147         (double[])target.Clone(),
    148         new string[] { inputVar },
    149         s: lambda,
    150         targetVar: problemData.TargetVariable);
    151     }
    152 
     151      if (problemData.Dataset.VariableHasType<double>(inputVar)) {
     152        // Umständlich!
     153        return Splines.CalculateSmoothingSplineReinschWithAutomaticTolerance(
     154          problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(),
     155          (double[])target.Clone(),
     156          new string[] { inputVar },
     157          targetVar: problemData.TargetVariable);
     158      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
     159    }
     160    private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) {
     161      if (problemData.Dataset.VariableHasType<double>(inputVar)) {
     162        // Umständlich!
     163        var ds = ((Dataset)problemData.Dataset).ToModifiable();
     164        ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>());
     165        var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable);
     166        pd.TrainingPartition.Start = problemData.TrainingPartition.Start;
     167        pd.TrainingPartition.End = problemData.TrainingPartition.End;
     168        pd.TestPartition.Start = problemData.TestPartition.Start;
     169        pd.TestPartition.End = problemData.TestPartition.End;
     170        double rmsError, oobRmsError;
     171        double avgRelError, oobAvgRelError;
     172        return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError);
     173      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
     174    }
    153175  }
    154176
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs

    r15364 r15365  
    3636using HeuristicLab.Problems.DataAnalysis.Symbolic;
    3737using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression;
     38using HeuristicLab.Random;
    3839using MathNet.Numerics.Interpolation;
    3940using MathNet.Numerics.LinearAlgebra.Double;
     
    7071          "Smoothing Spline (Mine)",
    7172          "Smoothing Spline (Reinsch)",
     73          "Smoothing Spline (Reinsch with automatic tolerance determination using CV)",
     74          "B-Spline Smoothing"
    7275      }.Select(s => new StringValue(s)));
    7376
    7477      Parameters.Add(new ConstrainedValueParameter<StringValue>("Type", "The type of spline (as supported by alglib)", validTypes, validTypes.First()));
    7578      Parameters.Add(new ValueParameter<DoubleValue>("Lambda", "Regularization parameter for smoothing splines (0..+inf)", new DoubleValue(100)));
     79      Parameters.Add(new ValueParameter<DoubleValue>("StdDev (noise)", "Known error in y values. Used only be Reinsch Smoothing Splines", new DoubleValue(0.1)));
    7680      Problem = new RegressionProblem();
    7781    }
     
    145149              break;
    146150            }
     151          case "Smoothing Spline (Reinsch with automatic tolerance determination using CV)": {
     152              var model = CalculateSmoothingSplineReinschWithAutomaticTolerance(x, y, inputVars, Problem.ProblemData.TargetVariable);
     153              Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone())));
     154              break;
     155            }
     156          case "B-Spline Smoothing": {
     157              BSplineSmoothing(x, y, inputVars);
     158              break;
     159            }
    147160          default: throw new NotSupportedException();
    148161        }
     
    151164    }
    152165
    153     // array with non-zero lower index
    154     private class MyArray<T> {
    155       private T[] arr;
    156       private int lowerBound;
    157 
    158       public T this[int key] {
    159         get {
    160           return arr[key - lowerBound];
    161         }
    162         set {
    163           arr[key - lowerBound] = value;
    164         }
    165       }
    166 
    167       public MyArray(int lowerBound, int numElements) {
    168         this.lowerBound = lowerBound;
    169         arr = new T[numElements];
    170       }
    171       public MyArray(int lowerBound, T[] source) : this(lowerBound, source.Length) {
    172         Array.Copy(source, arr, source.Length);
    173       }
    174 
    175       public T[] ToArray() {
    176         var res = new T[arr.Length];
    177         Array.Copy(arr, res, res.Length);
    178         return res;
    179       }
     166    public void BSplineSmoothing(double[] x, double[] y, string[] inputVars) {
     167      Array.Sort(x, y);
     168
     169      CubicSpline2(x, y, inputVars);
     170
     171      //  // see Elements of Statistical Learning Appendix: Computations for Splines
     172      //  int M = 4; // order of spline (cubic)
     173      //  int K = x.Length;
     174      //  int n = x.Length;
     175      //
     176      //  double[] t = new double[K + 2 * M];
     177      //  for (int i = 0; i < M; i++) t[i] = x[0];
     178      //  for (int i = K; i < K + 2 * M; i++) t[i] = x[K - 1];
     179      //
     180      //  double[,] B = new double[n, n + 4]; // basis function matrix
     181      //  double[,] W = new double[n + 4, n + 4]; // penalty matrix (W[j,k] =
     182      //  for (int i = 0; i < M; i++) B[i] = new double[K + 2 * M];
     183      //  for (int j = 0; j < K; j++) {
     184      //    for (int i = 0; i < K + 2M - 1; i++) {
     185      //      if (t[i] <= x[j] && x[j] < t[i + 1])
     186      //        B[1][i] = 1.0;
     187      //    }
     188      //  }
     189
     190
     191    }
     192
     193    private struct SplineData {
     194      public double a, b, c, d, x, y;
     195    };
     196
     197
     198    /*
     199     * The procedure Quincunx, takes as arguments
     200       the vectors u, v and w which are respectively the diagonal, the first supradiagonal
     201       and the second supradiagonal of the banded matrix. The vector on
     202       the LHS of the equation (80) is placed in q which contains the solution on the
     203       completion of the procedure.
     204    */
     205    private void Quincunx(int n, MyArray<double> u, MyArray<double> v, MyArray<double> w, MyArray<double> q) {
     206      // { factorisation}
     207      u[-1] = 0;
     208      u[0] = 0;
     209      for (int j = 1; j <= n - 1; j++) {
     210        u[j] = u[j] - u[j - 2] * Math.Pow(w[j - 2], 2) - u[j - 1] * Math.Pow(v[j - 1], 2);
     211        v[j] = (v[j] - u[j - 1] * v[j - 1] * w[j - 1]) / u[j];
     212        w[j] = w[j] / u[j];
     213      }
     214      // { forward substitution}
     215      for (int j = 1; j <= n - 1; j++) {
     216        q[j] = q[j] - v[j - 1] * q[j - 1] - w[j - 2] * q[j - 2];
     217      }
     218      for (int j = 1; j < n - 1; j++) {
     219        q[j] = q[j] / u[j];
     220      }
     221      // { back substitution}
     222      q[n + 1] = 0;
     223      q[n] = 0;
     224      for (int j = n - 1; j >= 1; j--) {
     225        q[j] = q[j] - v[j] * q[j + 1] - w[j] * q[j + 2];
     226      }
     227    }
     228
     229    public void CubicSpline2(double[] x, double[] y, string[] inputVars) {
     230      // see Pollock: Smoothing Splines
     231      int n = x.Length;
     232      double lambda = 1.0;
     233      double[] sigma = Enumerable.Repeat(1.0, n).ToArray();
     234      SplineData[] S = new SplineData[x.Length + 1];
     235      for (int i = 0; i < x.Length; i++) {
     236        S[i].x = x[i];
     237        S[i].y = y[i];
     238      }
     239      S[x.Length].x = S[x.Length - 1].x;
     240      S[x.Length].y = S[x.Length - 1].y;
     241
     242      // var
     243      double[] h = new double[n],
     244        r = new double[n],
     245        f = new double[n],
     246        p = new double[n];
     247      var q = new MyArray<double>(-1, n + 3);
     248      var u = new MyArray<double>(-1, n + 3);
     249      var v = new MyArray<double>(-1, n + 3);
     250      var w = new MyArray<double>(-1, n + 3);
     251      double mu;
     252
     253      mu = 2 * (1 - lambda) / (3 * lambda);
     254      h[0] = S[1].x - S[0].x;
     255      r[0] = 3 / h[0];
     256      for (int i = 1; i < n - 1; i++) {
     257        h[i] = S[i + 1].x - S[i].x;
     258        r[i] = 3 / h[i];
     259        f[i] = -(r[i - 1] + r[i]);
     260        p[i] = 2 * (S[i + 1].x - S[i - 1].x);
     261        q[i] = 3 * (S[i + 1].y - S[i].y) / h[i] - 3 * (S[i].y - S[i - 1].y) / h[i - 1];
     262      }
     263      for (int i = 1; i < n; i++) {
     264        u[i] = Math.Pow(r[i - 1], 2) * sigma[i - 1] + Math.Pow(f[i], 2) * sigma[i] + Math.Pow(r[i], 2) * sigma[i + 1];    // TODO Sigma 1..n
     265        u[i] = mu * u[i] + p[i];
     266        v[i] = f[i] * r[i] * sigma[i] + r[i] * f[i + 1] * sigma[i + 1];
     267        v[i] = mu * v[i] + h[i];
     268        w[i] = mu * r[i] * r[i + 1] * sigma[i + 1];
     269      }
     270      Quincunx(n, u, v, w, q);
     271      // { Spline P arameters}
     272      S[0].d = S[0].y - mu * r[0] * q[1] * sigma[0];
     273      S[1].d = S[1].y - mu * (f[1] * q[1] + r[1] * q[2]) * sigma[0];
     274      S[0].a = q[1] / (3 * h[0]);
     275      S[0].b = 0;
     276      S[0].c = (S[1].d - S[0].d) / h[0] - q[1] * h[0] / 3;
     277      r[0] = 0;
     278      for (int j = 1; j < n; j++) {
     279        S[j].a = (q[j + 1] - q[j]) / (3 * h[j]);
     280        S[j].b = q[j];
     281        S[j].c = (q[j] + q[j - 1]) * h[j - 1] + S[j - 1].c;
     282        S[j].d = r[j - 1] * q[j - 1] + f[j] * q[j] + r[j] * q[j + 1];
     283        S[j].d = y[j] - mu * S[j].d * sigma[j];
     284      }
     285
     286      //   { SmoothingSpline}
    180287    }
    181288
    182289    public void CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars) {
    183       double s = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value; // controls extent of smoothing
    184 
    185       var model = CalculateSmoothingSplineReinsch(xOrig, yOrig, inputVars, s, Problem.ProblemData.TargetVariable);
     290      double s = xOrig.Length + 1;
     291      double stdTol = ((IValueParameter<DoubleValue>)Parameters["StdDev (noise)"]).Value.Value; // controls extent of smoothing
     292
     293      var model = CalculateSmoothingSplineReinsch(xOrig, yOrig, inputVars, stdTol, s, Problem.ProblemData.TargetVariable);
    186294
    187295      Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone())));
     
    190298    }
    191299
    192     public static IRegressionModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double s, string targetVar) {
     300
     301
     302    public static IRegressionModel CalculateSmoothingSplineReinschWithAutomaticTolerance(double[] xOrig, double[] yOrig, string[] inputVars, string targetVar) {
     303      int n = xOrig.Length;
     304      var rows = Enumerable.Range(0, n);
     305      var rand = new FastRandom(1234);
     306      var trainRows = rows.Shuffle(rand).Take((int)(n * 0.66)).ToArray();
     307      var testRows = rows.Except(trainRows).ToArray();
     308
     309      var trainX = trainRows.Select(i => xOrig[i]).ToArray();
     310      var trainY = trainRows.Select(i => yOrig[i]).ToArray();
     311      var testX = testRows.Select(i => xOrig[i]).ToArray();
     312      var testY = testRows.Select(i => yOrig[i]).ToArray();
     313
     314      var shuffledDs = new Dataset(inputVars.Concat(new string[] { targetVar }), new[] { trainX.Concat(testX).ToArray(), trainY.Concat(testY).ToArray() });
     315      var shuffeledProblemData = new RegressionProblemData(shuffledDs, inputVars, targetVar);
     316      shuffeledProblemData.TrainingPartition.Start = 0;
     317      shuffeledProblemData.TrainingPartition.End = trainX.Length;
     318      shuffeledProblemData.TestPartition.Start = trainX.Length;
     319      shuffeledProblemData.TestPartition.End = shuffledDs.Rows + 1;
     320
     321      double curTol = trainY.StandardDeviation();
     322      double prevTestRMSE = double.PositiveInfinity;
     323      double testRMSE = double.PositiveInfinity;
     324      IRegressionModel prevModel = null;
     325      IRegressionModel model = null;
     326      do {                                     
     327        prevModel = model;
     328        prevTestRMSE = testRMSE;
     329        model = CalculateSmoothingSplineReinsch(trainX, trainY, inputVars, curTol, s: trainX.Length + 1, targetVar: targetVar);
     330
     331        var sol = model.CreateRegressionSolution(shuffeledProblemData);
     332        var trainRMSE = sol.TrainingRootMeanSquaredError;
     333        testRMSE = sol.TestRootMeanSquaredError;
     334        curTol = Math.Max(curTol * 0.5, 1e-12 * trainY.StandardDeviation());
     335      } while (testRMSE < prevTestRMSE);
     336      return prevModel;
     337    }
     338
     339
     340    public static IRegressionModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double stdTol, double s, string targetVar) {
    193341      var minX = xOrig.Min();
    194342      var maxX = xOrig.Max();
    195343      var range = maxX - minX;
    196344
    197       SortAndBin(xOrig, yOrig, out xOrig, out yOrig, scaling: true);
     345      double[] w = Enumerable.Repeat(stdTol, xOrig.Length).ToArray();
     346      SortAndBin((double[])xOrig.Clone(), (double[])yOrig.Clone(), w, out xOrig, out yOrig, out w, scaling: false);
    198347
    199348      // See Smoothing by Spline Functions, Reinsch, 1967
     
    202351      var x = new MyArray<double>(1, xOrig);
    203352      var y = new MyArray<double>(1, yOrig);
    204       var inv_dy = new MyArray<double>(1, Enumerable.Repeat(1.0, n).ToArray()); // equal weights
     353      var inv_dy = new MyArray<double>(1, w);
    205354
    206355      int n1 = 1;
    207       int n2 = n - 1;
     356      int n2 = n;
    208357
    209358      // results
     
    283432      }
    284433
    285       return new ReinschSmoothingSplineModel(a.ToArray(), b.ToArray(), c.ToArray(), d.ToArray(), xOrig, targetVar, inputVars, minX, 1.0 / range);
     434      return new ReinschSmoothingSplineModel(a, b, c, d, x, targetVar, inputVars);
    286435    }
    287436
    288437    private void CalculateSmoothingSpline(double[] x, double[] y, string[] inputVars) {
    289438      // see Smoothing and Non-Parametric Regression, Germán Rodríguez, 2001 2.3.1
    290       SortAndBin(x, y, out x, out y, scaling: false);
     439      double[] w = Enumerable.Repeat(1.0, x.Length).ToArray();  // weights necessary for sortAndBin but are ignored below (TODO)
     440      SortAndBin(x, y, w, out x, out y, out w, scaling: false);
    291441      int n = x.Length;
    292442
     
    314464      }
    315465
     466      // WInvD = W.Cholesky().Solve(delta);
    316467      var solvResult = W.TrySolveIterative(delta, WInvD, new MlkBiCgStab());
    317468
     
    339490
    340491      Results.Add(new Result("Solution", new RegressionSolution(new SmoothingSplineModel(s, x, Problem.ProblemData.TargetVariable, inputVars),
    341  (IRegressionProblemData)Problem.ProblemData.Clone())));
    342     }
    343 
    344     private static void SortAndBin(double[] x, double[] y, out double[] x2, out double[] y2, bool scaling = false) {
    345       // sort y by x
    346       Array.Sort(x, y);
     492  (IRegressionProblemData)Problem.ProblemData.Clone())));
     493    }
     494
     495    private static void SortAndBin(double[] x, double[] y, double[] w, out double[] x2, out double[] y2, out double[] w2, bool scaling = false) {
     496      var sortedIdx = Enumerable.Range(0, x.Length).ToArray();
     497      // sort by x
     498      Array.Sort(x, sortedIdx);
    347499
    348500      var xl = new List<double>();
    349501      var yl = new List<double>();
     502      var wl = new List<double>();
    350503
    351504      int n = x.Length;
     
    360513          double sumX = 0.0;
    361514          double sumY = 0.0;
     515          double sumW = 0.0;
    362516          while (j < n && x[j] - x[i] <= binSize) {
    363517            k++;
    364518            sumX += x[j];
    365             sumY += y[j];
     519            sumY += y[sortedIdx[j]];
     520            sumW += w[sortedIdx[j]];
    366521            j++;
    367522          }
    368523          var avgX = sumX / k;
    369524          if (scaling) avgX = (avgX - x[0]) / range;
    370           xl.Add(avgX);     // scale
     525          xl.Add(avgX);
    371526          yl.Add(sumY / k);
     527          wl.Add(sumW);
    372528          i += k;
    373529        }
     
    376532      x2 = xl.ToArray();
    377533      y2 = yl.ToArray();
     534      w2 = wl.ToArray();
    378535    }
    379536
     
    390547
    391548
     549  // array with non-zero lower index
     550  internal class MyArray<T> {
     551    private T[] arr;
     552    private int lowerBound;
     553
     554    public int Length { get { return arr.Length; } }
     555
     556    public T this[int key] {
     557      get {
     558        return arr[key - lowerBound];
     559      }
     560      set {
     561        arr[key - lowerBound] = value;
     562      }
     563    }
     564
     565    public MyArray(int lowerBound, int numElements) {
     566      this.lowerBound = lowerBound;
     567      arr = new T[numElements];
     568    }
     569    public MyArray(int lowerBound, T[] source) : this(lowerBound, source.Length) {
     570      Array.Copy(source, arr, source.Length);
     571    }
     572
     573    public T[] ToArray() {
     574      var res = new T[arr.Length];
     575      Array.Copy(arr, res, res.Length);
     576      return res;
     577    }
     578  }
     579
    392580
    393581  // UNFINISHED
    394   public class ReinschSmoothingSplineModel : NamedItem, IRegressionModel {
    395     private double[] a;
    396     private double[] b;
    397     private double[] c;
    398     private double[] d;
    399     private double[] x;
     582  internal class ReinschSmoothingSplineModel : NamedItem, IRegressionModel {
     583    private MyArray<double> a;
     584    private MyArray<double> b;
     585    private MyArray<double> c;
     586    private MyArray<double> d;
     587    private MyArray<double> x;
    400588    private double offset;
    401589    private double scale;
     
    418606      this.offset = orig.offset;
    419607    }
    420     public ReinschSmoothingSplineModel(double[] a, double[] b, double[] c, double[] d, double[] x, string targetVar, string[] inputs, double offset, double scale) : base("SplineModel", "SplineModel") {
     608    public ReinschSmoothingSplineModel(
     609      MyArray<double> a,
     610      MyArray<double> b,
     611      MyArray<double> c,
     612      MyArray<double> d,
     613      MyArray<double> x,
     614      string targetVar, string[] inputs, double offset = 0, double scale = 1) : base("SplineModel", "SplineModel") {
    421615      this.a = a;
    422616      this.b = b;
     
    428622      this.scale = scale;
    429623      this.offset = offset;
     624
     625      // extrapolate for xx > x[n2]
     626      b[b.Length] = b[b.Length - 1];
     627      d[b.Length] = d[d.Length - 1];
    430628    }
    431629
     
    441639      int n = x.Length;
    442640      foreach (var xx in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).Select(xi => (xi - offset) * scale)) {
    443         if (xx < x[0]) yield return a[0];
    444         else if (xx >= x[n - 1]) yield return a[n - 2];
    445         else {
     641        if (xx <= x[1]) {
     642          double h = xx - x[1];
     643          yield return a[1] + h * (b[1] + h * (c[1] + h * d[1]));
     644        } else if (xx >= x[n]) {
     645          double h = xx - x[n];
     646          yield return a[n] + h * (b[n] + h * (c[n] + h * d[n]));
     647        } else {
    446648          // binary search
    447           int lower = 0;
    448           int upper = n - 1;
     649          int lower = 1;
     650          int upper = n;
    449651          while (true) {
    450652            if (upper < lower) throw new InvalidProgramException();
Note: See TracChangeset for help on using the changeset viewer.