Changeset 15450


Ignore:
Timestamp:
11/06/17 13:12:41 (3 years ago)
Author:
gkronber
Message:

#2789 more tests with CV and automatic determination of smoothing parameter

Location:
branches/MathNetNumerics-Exploration-2789
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs

    r15449 r15450  
    111111        nTerms += inputVars.Combinations(i).Count();
    112112      }
     113
    113114      IRegressionModel[] f = new IRegressionModel[nTerms];
    114115      for (int i = 0; i < f.Length; i++) {
     
    130131      string[] terms = new string[f.Length];
    131132      Results.Add(new Result("RSS Values", typeof(DoubleMatrix)));
     133
     134      var combinations = new List<string[]>();
     135      for(int i=1;i<=maxInteractions;i++)
     136        combinations.AddRange(HeuristicLab.Common.EnumerableExtensions.Combinations(inputVars, i).Select(c => c.ToArray()));
     137      // combinations.Add(new string[] { "X1", "X2" });
     138      // combinations.Add(new string[] { "X3", "X4" });
     139      // combinations.Add(new string[] { "X5", "X6" });
     140      // combinations.Add(new string[] { "X1", "X7", "X9" });
     141      // combinations.Add(new string[] { "X3", "X6", "X10" });
     142
     143
    132144
    133145      // until convergence
     
    144156        //}
    145157
    146         for (int interaction = 1; interaction <= maxInteractions; interaction++) {
    147           var selectedVars = HeuristicLab.Common.EnumerableExtensions.Combinations(inputVars, interaction);
    148 
    149           foreach (var element in selectedVars) {
    150             var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
    151             rss[j] = res.Variance();
    152             terms[j] = string.Format("f({0})", string.Join(",", element));
    153             f[j] = RegressSpline(problemData, element.ToArray(), res, lambda);
    154             j++;
    155           }
     158
     159
     160        foreach (var element in combinations) {
     161          var res = CalculateResiduals(problemData, f, j, avgY, problemData.TrainingIndices);
     162          rss[j] = res.Variance();
     163          terms[j] = string.Format("f({0})", string.Join(",", element));
     164          f[j] = RegressSpline(problemData, element.ToArray(), res, lambda);
     165          j++;
    156166        }
    157167
     
    218228          product = product.Zip(problemData.Dataset.GetDoubleValues(inputVars[i], problemData.TrainingIndices), (pi, vi) => pi * vi).ToArray();
    219229        }
    220         CubicSplineGCV.CubGcvReport report;
    221         return CubicSplineGCV.CalculateCubicSpline(
    222           product,
    223           (double[])target.Clone(),
    224           problemData.TargetVariable, inputVars, out report
    225           );
     230        // CubicSplineGCV.CubGcvReport report;
     231        // return CubicSplineGCV.CalculateCubicSpline(
     232        //   product,
     233        //   (double[])target.Clone(),
     234        //   problemData.TargetVariable, inputVars, out report
     235        //   );
     236
     237        double optTolerance; double cvRMSE;
     238        // find tolerance
     239        // var ensemble = Splines.CalculateSmoothingSplineReinsch(product, (double[])target.Clone(), inputVars, problemData.TargetVariable, out optTolerance, out cvRMSE);
     240        // // train on whole data
     241        // return Splines.CalculateSmoothingSplineReinsch(product, (double[])target.Clone(), inputVars, optTolerance, product.Length - 1, problemData.TargetVariable);
     242
     243
     244        // find tolerance
     245        var bestLambda = -5.0;
     246        double bestCVRMSE = double.PositiveInfinity;
     247        double avgTrainRMSE = double.PositiveInfinity;
     248        for (double curLambda = -5.0; curLambda <= 6.0; curLambda += 1.0) {
     249          var ensemble = Splines.CalculatePenalizedRegressionSpline(product, (double[])target.Clone(), curLambda, problemData.TargetVariable,  inputVars, out avgTrainRMSE, out cvRMSE);
     250          Console.Write("{0} {1} {2}", curLambda, avgTrainRMSE, cvRMSE);
     251          if (bestCVRMSE > cvRMSE) {
     252            Console.Write(" *");
     253            bestCVRMSE = cvRMSE;
     254            bestLambda = curLambda;
     255          }
     256          Console.WriteLine();
     257        }
     258        // train on whole data
     259       return Splines.CalculatePenalizedRegressionSpline(product, (double[])target.Clone(), bestLambda, problemData.TargetVariable, inputVars, out avgTrainRMSE, out cvRMSE);
     260
    226261      } else return new ConstantModel(target.Average(), problemData.TargetVariable);
    227262    }
  • branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs

    r15449 r15450  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Diagnostics;
    2425using System.Linq;
    2526using System.Threading;
     
    156157          case "CUBGCV": {
    157158              CubicSplineGCV.CubGcvReport report;
    158               var model = 
    159                 CubicSplineGCV.CalculateCubicSpline(x, y, 
     159              var model =
     160                CubicSplineGCV.CalculateCubicSpline(x, y,
    160161                Problem.ProblemData.TargetVariable, inputVars, out report);
    161162              var targetVar = Problem.ProblemData.TargetVariable;
     
    165166              Results.Add(new Result("Estimated error variance", new DoubleValue(report.estimatedErrorVariance)));
    166167              Results.Add(new Result("Estimated RSS degrees of freedom", new DoubleValue(report.estimatedRSSDegreesOfFreedom)));
    167               Results.Add(new Result("Estimated treu mean squared error at data points", new DoubleValue(report.estimatedTrueMeanSquaredErrorAtDataPoints)));
     168              Results.Add(new Result("Estimated true mean squared error at data points", new DoubleValue(report.estimatedTrueMeanSquaredErrorAtDataPoints)));
    168169              Results.Add(new Result("Mean square of DF", new DoubleValue(report.meanSquareOfDf)));
    169170              Results.Add(new Result("Mean square residual", new DoubleValue(report.meanSquareResiudal)));
     
    204205    public static IRegressionEnsembleModel CalculatePenalizedRegressionSpline(double[] x, double[] y, double lambda, string targetVar, string[] inputVars,
    205206      out double avgTrainRMSE,
    206       out double loocvRMSE) {
     207      out double cvRMSE) {
    207208      int info;
    208209      alglib.spline1dinterpolant interpolant;
     
    211212      n = (int)Math.Max(50, 3 * Math.Sqrt(n));
    212213
    213       double[] x_loo = new double[x.Length - 1];
    214       double[] y_loo = new double[y.Length - 1];
     214      int folds = 10;
     215      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
     216      if (foldSize == 1) folds = x.Length;
     217
     218      double[] x_loo = new double[(folds - 1) * foldSize];
     219      double[] y_loo = new double[(folds - 1) * foldSize];
    215220
    216221      var models = new List<IRegressionModel>();
    217       var sumTrainRmse = 0.0;
    218       var sse = 0.0;
    219 
    220       for (int i = 0; i < x.Length; i++) {
    221         Array.Copy(x, 0, x_loo, 0, i);
    222         Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
    223         Array.Copy(y, 0, y_loo, 0, i);
    224         Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
     222      var sumTrainSE = 0.0;
     223      int nTrain = 0;
     224      var sumTestSE = 0.0;
     225      int nTest = 0;
     226
     227
     228      for (int i = 0; i < folds; i++) {
     229        if (i < folds - 1) {
     230          //Console.WriteLine("Copy from {0} to {1} n = {2}", (i + 1) * foldSize, i * foldSize, (folds - i - 1) * foldSize);
     231          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, (folds - i - 1) * foldSize);
     232          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, (folds - i - 1) * foldSize);
     233        }
    225234        alglib.spline1dfitpenalized(x_loo, y_loo, n, lambda, out info, out interpolant, out rep);
    226         var y_pred = alglib.spline1dcalc(interpolant, x[i]);
    227         double res = y[i] - y_pred;
    228         sse += res * res;
    229         sumTrainRmse += rep.rmserror;
    230 
    231         var model = new Spline1dModel(interpolant, targetVar, inputVars);
    232         models.Add(model);
    233       }
    234 
    235       loocvRMSE = Math.Sqrt(sse / x.Length);
    236       avgTrainRMSE = sumTrainRmse / x.Length;
     235        Debug.Assert(y_loo.All(yi => yi != 0.0));
     236        Debug.Assert(x_loo.All(xi => xi != 0.0));
     237
     238        // training error
     239        for (int j = 0; j < x_loo.Length; j++) {
     240          var y_pred = alglib.spline1dcalc(interpolant, x[i]);
     241          double res = y[j] - y_pred;
     242          sumTrainSE += res * res;
     243          nTrain++;
     244        }
     245        // test
     246        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
     247          var y_pred = alglib.spline1dcalc(interpolant, x[i]);
     248          double res = y[j] - y_pred;
     249          sumTestSE += res * res;
     250          nTest++;
     251        }
     252
     253        models.Add(new Spline1dModel(interpolant, targetVar, inputVars));
     254
     255        if (i < folds - 1) {
     256          //Console.WriteLine("Copy from {0} to {1} n = {2}", i * foldSize, i * foldSize, foldSize);
     257          // add current fold for next iteration
     258          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
     259          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
     260        }
     261      }
     262
     263      cvRMSE = Math.Sqrt(sumTestSE / nTest);
     264      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
    237265
    238266      return new RegressionEnsembleModel(models);
     
    352380    public static IRegressionEnsembleModel CalculateSmoothingSplineReinsch(double[] x, double[] y, double tol, string targetVar, string[] inputVars,
    353381      out double avgTrainRMSE,
    354       out double loocvRMSE) {
     382      out double cvRMSE) {
    355383      int info;
    356384
    357       double[] x_loo = new double[x.Length - 1];
    358       double[] y_loo = new double[y.Length - 1];
     385      int folds = 10;
     386      int foldSize = x.Length <= 30 ? 1 : x.Length / folds;
     387      if (foldSize == 1) folds = x.Length;
     388
     389      double[] x_loo = new double[(folds - 1) * foldSize];
     390      double[] y_loo = new double[(folds - 1) * foldSize];
    359391
    360392      var models = new List<IRegressionModel>();
    361       var sumTrainRmse = 0.0;
    362       var sse = 0.0;
    363 
    364       for (int i = 0; i < x.Length; i++) {
    365         Array.Copy(x, i + 1, x_loo, i, x.Length - (i + 1));
    366         Array.Copy(y, i + 1, y_loo, i, y.Length - (i + 1));
    367 
    368         var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, x_loo.Length + 1, targetVar);
    369 
    370         var y_pred = model.GetEstimatedValue(x[i]);
    371         double res = y[i] - y_pred;
    372         sse += res * res;
     393      var sumTrainSE = 0.0;
     394      int nTrain = 0;
     395      var sumTestSE = 0.0;
     396      int nTest = 0;
     397
     398
     399      for (int i = 0; i < folds; i++) {
     400        if (i < folds - 1) {
     401          Array.Copy(x, (i + 1) * foldSize, x_loo, i * foldSize, x.Length - ((i + 1) * foldSize) - 1);
     402          Array.Copy(y, (i + 1) * foldSize, y_loo, i * foldSize, y.Length - ((i + 1) * foldSize) - 1);
     403        }
     404        var model = CalculateSmoothingSplineReinsch(x_loo, y_loo, inputVars, tol, ((folds - 1) * foldSize) - 1, targetVar);
     405
     406        // training error
     407        for(int j = 0;j<x_loo.Length;j++) {
     408          var y_pred = model.GetEstimatedValue(x[j]);
     409          double res = y[j] - y_pred;
     410          sumTrainSE += res * res;
     411          nTrain++;
     412        }
     413        // test
     414        for (int j = i * foldSize; j < (i + 1) * foldSize; j++) {
     415          var y_pred = model.GetEstimatedValue(x[j]);
     416          double res = y[j] - y_pred;
     417          sumTestSE += res * res;
     418          nTest++;
     419        }
    373420
    374421        models.Add(model);
    375422
    376         // add current row for next iteration
    377         if (i < x_loo.Length) {
    378           x_loo[i] = x[i];
    379           y_loo[i] = y[i];
    380         }
    381       }
    382 
    383       loocvRMSE = Math.Sqrt(sse / x.Length);
    384       avgTrainRMSE = sumTrainRmse / x.Length;
     423        if (i < folds - 1) {
     424          // add current fold for next iteration
     425          Array.Copy(x, i * foldSize, x_loo, i * foldSize, foldSize);
     426          Array.Copy(y, i * foldSize, y_loo, i * foldSize, foldSize);
     427        }
     428      }
     429
     430      cvRMSE = Math.Sqrt(sumTestSE / nTest);
     431      avgTrainRMSE = Math.Sqrt(sumTrainSE / nTrain);
    385432
    386433      return new RegressionEnsembleModel(models);
     
    388435
    389436
    390     // automatically determines tolerance using loo cv
     437    // automatically determines tolerance using loo cv, SLOW!
    391438    public static IRegressionModel CalculateSmoothingSplineReinsch(double[] x, double[] y, string[] inputVars, string targetVar,
    392       out double optTolerance, out double looRMSE) {
     439      out double optTolerance, out double cvRMSE) {
    393440      var maxTolerance = y.StandardDeviation();
    394441      var minTolerance = maxTolerance * 1E-6;
    395442      optTolerance = maxTolerance;
    396443      var tol = maxTolerance;
    397       looRMSE = double.PositiveInfinity;
     444      cvRMSE = double.PositiveInfinity;
    398445      IRegressionModel bestModel = new ConstantModel(y.Average(), targetVar);
    399446      while (tol > minTolerance) {
    400         double loo_rmse;
     447        double curCVRMSE;
    401448        double avgTrainRmse;
    402449        var model = Splines.CalculateSmoothingSplineReinsch(
    403           x, y, tol, targetVar, inputVars, out avgTrainRmse, out loo_rmse);
    404 
    405         if (loo_rmse < looRMSE) {
    406           looRMSE = loo_rmse;
     450          x, y, tol, targetVar, inputVars, out avgTrainRmse, out curCVRMSE);
     451
     452        if (curCVRMSE < cvRMSE) {
     453          cvRMSE = curCVRMSE;
    407454          optTolerance = tol;
    408455          bestModel = model;
  • branches/MathNetNumerics-Exploration-2789/Main/Main.csproj

    r15449 r15450  
    3838      <HintPath>..\..\..\trunk\sources\bin\ALGLIB-3.7.0.dll</HintPath>
    3939    </Reference>
     40    <Reference Include="HeuristicLab.Algorithms.DataAnalysis-3.4, Version=3.4.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     41      <SpecificVersion>False</SpecificVersion>
     42      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Algorithms.DataAnalysis-3.4.dll</HintPath>
     43    </Reference>
     44    <Reference Include="HeuristicLab.Collections-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
    4045    <Reference Include="HeuristicLab.Common-3.3">
    4146      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Common-3.3.dll</HintPath>
    4247    </Reference>
    4348    <Reference Include="HeuristicLab.Core-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
     49    <Reference Include="HeuristicLab.Optimization-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
    4450    <Reference Include="HeuristicLab.Problems.DataAnalysis-3.4, Version=3.4.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
    4551      <SpecificVersion>False</SpecificVersion>
    4652      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.DataAnalysis-3.4.dll</HintPath>
     53    </Reference>
     54    <Reference Include="HeuristicLab.Problems.Instances-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     55      <SpecificVersion>False</SpecificVersion>
     56      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances-3.3.dll</HintPath>
     57    </Reference>
     58    <Reference Include="HeuristicLab.Problems.Instances.DataAnalysis-3.3">
     59      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances.DataAnalysis-3.3.dll</HintPath>
    4760    </Reference>
    4861    <Reference Include="System" />
  • branches/MathNetNumerics-Exploration-2789/Main/Program.cs

    r15449 r15450  
    1212  class Program {
    1313    static void Main(string[] args) {
    14       var xs = HeuristicLab.Common.SequenceGenerator.GenerateSteps(-3.5, 3.5, 0.1, includeEnd: true).ToList();
    15       var ys = xs.Select(xi => 1.0 / Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * xi * xi)).ToArray(); // 1.0 / (Math.Sqrt(2 * Math.PI) * Math.Exp(-0.5 * xi * xi))).ToArray();
     14      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.VariousInstanceProvider();
     15      var problemData = provider.LoadData(provider.GetDataDescriptors().First(dd => dd.Name.Contains("Poly")));
     16      // var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
     17      // var problemData = provider.LoadData(provider.GetDataDescriptors().First(dd => dd.Name.Contains("Chem")));
    1618
    17       int n = xs.Count();
    18       alglib.hqrndstate state;
    19       alglib.hqrndseed(1234, 5678, out state);
    20       var ys_noise = ys.Select(yi => yi + alglib.hqrndnormal(state) * 0.1).ToList();
     19      var gam = new GAM();
     20      gam.MaxIterations = 10;
     21      gam.MaxInteractions = 3;
     22      gam.Problem.ProblemData = problemData;
     23      gam.Start();
    2124
    22       CubicSplineGCV.CubGcvReport report;
    23       var model = CubicSplineGCV.CalculateCubicSpline(
    24         xs.ToArray(), ys_noise.ToArray(), "y", new string[] { "x" }, out report);
     25      var solution = (IRegressionSolution)gam.Results["Ensemble solution"].Value;
    2526
    26       Console.WriteLine("Smoothing Parameter (= RHO/(RHO + 1) {0}", report.smoothingParameter);
    27       Console.WriteLine("Estimated DOF of RSS {0}", report.estimatedRSSDegreesOfFreedom);
    28       Console.WriteLine("GCV {0}", report.generalizedCrossValidation);
    29       Console.WriteLine("Mean squared residual {0}", report.meanSquareResiudal);
    30       Console.WriteLine("Estimate of true MSE at data points {0}", report.estimatedTrueMeanSquaredErrorAtDataPoints);
    31       Console.WriteLine("Estimate of error variance {0}", report.estimatedErrorVariance);
    32       Console.WriteLine("Mean square value of DF(I) {0}", report.meanSquareOfDf);
    33 
    34       OnlineCalculatorError error;
    35       var ys_smoothed = xs.Select(xi => model.GetEstimatedValue(xi)).ToArray();
    36       var mse = OnlineMeanSquaredErrorCalculator.Calculate(ys, ys_smoothed, out error);
    37       Console.WriteLine("MSE(ys, ys_smooth) = {0}", mse);
    38       mse = OnlineMeanSquaredErrorCalculator.Calculate(ys, ys_noise, out error);
    39       Console.WriteLine("MSE(ys, ys_noise) = {0}", mse);
    40 
    41       Thread.CurrentThread.CurrentCulture = CultureInfo.InvariantCulture;
    42 
    43       for (int i = 0; i < n; i++) {
    44         Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
    45           xs[i], ys[i], ys_smoothed[i], ys_noise[i], ys_smoothed[i] + 1.96 * report.se[i], ys_smoothed[i] - 1.96 * report.se[i]);
    46       }
     27      Console.WriteLine("RMSE (train) {0}", solution.TrainingRootMeanSquaredError);
     28      Console.WriteLine("RMSE (test) {0}", solution.TestRootMeanSquaredError);
    4729    }
    4830  }
  • branches/MathNetNumerics-Exploration-2789/Test/Test.csproj

    r15443 r15450  
    6262      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Algorithms.GeneticAlgorithm-3.3.dll</HintPath>
    6363    </Reference>
     64    <Reference Include="HeuristicLab.Collections-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
    6465    <Reference Include="HeuristicLab.Common-3.3">
    6566      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Common-3.3.dll</HintPath>
     
    9091    <Reference Include="HeuristicLab.Problems.Instances-3.3">
    9192      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances-3.3.dll</HintPath>
     93    </Reference>
     94    <Reference Include="HeuristicLab.Problems.Instances.DataAnalysis-3.3">
     95      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances.DataAnalysis-3.3.dll</HintPath>
    9296    </Reference>
    9397    <Reference Include="HeuristicLab.SequentialEngine-3.3">
  • branches/MathNetNumerics-Exploration-2789/Test/UnitTest1.cs

    r15443 r15450  
    110110
    111111      Console.WriteLine("Best tolerance {0}, RMSE (LOO): {1}", optTol, looRMSE);
     112    }
    112113
     114    [TestMethod]
     115    public void TestGAM() {
     116      var provider = new HeuristicLab.Problems.Instances.DataAnalysis.VariousInstanceProvider();
     117      var problemData = provider.LoadData(provider.GetDataDescriptors().First(dd => dd.Name.Contains("Poly")));
     118      // var provider = new HeuristicLab.Problems.Instances.DataAnalysis.RegressionRealWorldInstanceProvider();
     119      // var problemData = provider.LoadData(provider.GetDataDescriptors().First(dd => dd.Name.Contains("Chem")));
     120
     121      var gam = new GAM();
     122      gam.MaxIterations = 10;
     123      gam.MaxInteractions = 3;
     124      gam.Problem.ProblemData = problemData;
     125      gam.Start();
     126
     127      var solution = (IRegressionSolution)gam.Results["Ensemble solution"].Value;
     128
     129      Console.WriteLine("RMSE (train) {0}", solution.TrainingRootMeanSquaredError);
     130      Console.WriteLine("RMSE (test) {0}", solution.TestRootMeanSquaredError);
    113131    }
    114      
     132
    115133  }
    116134}
Note: See TracChangeset for help on using the changeset viewer.