Changeset 15365
- Timestamp:
- 09/17/17 18:06:27 (7 years ago)
- Location:
- branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/GAM.cs
r15364 r15365 109 109 110 110 rmseRow.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TrainingIndices).StandardDeviation()); // -1 index to use all predictors 111 rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation()); 111 rmseRowTest.Values.Add(CalculateResiduals(problemData, f, -1, avgY, problemData.TestIndices).StandardDeviation()); 112 113 if (cancellationToken.IsCancellationRequested) break; 112 114 } 113 115 … … 135 137 private IRegressionModel RegressLR(IRegressionProblemData problemData, string inputVar, double[] target) { 136 138 // Umständlich! 137 var ds = new Dataset(new string[] { inputVar, "y" }, new[] { problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToList(), target.ToList() }); 138 var pd = new RegressionProblemData(ds, new string[] { inputVar }, "y"); 139 var ds = ((Dataset)problemData.Dataset).ToModifiable(); 140 ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>()); 141 var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable); 142 pd.TrainingPartition.Start = problemData.TrainingPartition.Start; 143 pd.TrainingPartition.End = problemData.TrainingPartition.End; 144 pd.TestPartition.Start = problemData.TestPartition.Start; 145 pd.TestPartition.End = problemData.TestPartition.End; 139 146 double rmsError, cvRmsError; 140 147 return LinearRegression.CreateLinearRegressionSolution(pd, out rmsError, out cvRmsError).Model; … … 142 149 143 150 private IRegressionModel RegressSpline(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) { 144 // Umständlich! 145 return Splines.CalculateSmoothingSplineReinsch( 146 problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(), 147 (double[])target.Clone(), 148 new string[] { inputVar }, 149 s: lambda, 150 targetVar: problemData.TargetVariable); 151 } 152 151 if (problemData.Dataset.VariableHasType<double>(inputVar)) { 152 // Umständlich! 153 return Splines.CalculateSmoothingSplineReinschWithAutomaticTolerance( 154 problemData.Dataset.GetDoubleValues(inputVar, problemData.TrainingIndices).ToArray(), 155 (double[])target.Clone(), 156 new string[] { inputVar }, 157 targetVar: problemData.TargetVariable); 158 } else return new ConstantModel(target.Average(), problemData.TargetVariable); 159 } 160 private IRegressionModel RegressRF(IRegressionProblemData problemData, string inputVar, double[] target, double lambda) { 161 if (problemData.Dataset.VariableHasType<double>(inputVar)) { 162 // Umständlich! 163 var ds = ((Dataset)problemData.Dataset).ToModifiable(); 164 ds.ReplaceVariable(problemData.TargetVariable, target.Concat(Enumerable.Repeat(0.0, ds.Rows - target.Length)).ToList<double>()); 165 var pd = new RegressionProblemData(ds, new string[] { inputVar }, problemData.TargetVariable); 166 pd.TrainingPartition.Start = problemData.TrainingPartition.Start; 167 pd.TrainingPartition.End = problemData.TrainingPartition.End; 168 pd.TestPartition.Start = problemData.TestPartition.Start; 169 pd.TestPartition.End = problemData.TestPartition.End; 170 double rmsError, oobRmsError; 171 double avgRelError, oobAvgRelError; 172 return RandomForestRegression.CreateRandomForestRegressionModel(pd, 100, 0.5, 0.5, 1234, out rmsError, out avgRelError, out oobRmsError, out oobAvgRelError); 173 } else return new ConstantModel(target.Average(), problemData.TargetVariable); 174 } 153 175 } 154 176 -
branches/MathNetNumerics-Exploration-2789/HeuristicLab.Algorithms.DataAnalysis.Experimental/Splines.cs
r15364 r15365 36 36 using HeuristicLab.Problems.DataAnalysis.Symbolic; 37 37 using HeuristicLab.Problems.DataAnalysis.Symbolic.Regression; 38 using HeuristicLab.Random; 38 39 using MathNet.Numerics.Interpolation; 39 40 using MathNet.Numerics.LinearAlgebra.Double; … … 70 71 "Smoothing Spline (Mine)", 71 72 "Smoothing Spline (Reinsch)", 73 "Smoothing Spline (Reinsch with automatic tolerance determination using CV)", 74 "B-Spline Smoothing" 72 75 }.Select(s => new StringValue(s))); 73 76 74 77 Parameters.Add(new ConstrainedValueParameter<StringValue>("Type", "The type of spline (as supported by alglib)", validTypes, validTypes.First())); 75 78 Parameters.Add(new ValueParameter<DoubleValue>("Lambda", "Regularization parameter for smoothing splines (0..+inf)", new DoubleValue(100))); 79 Parameters.Add(new ValueParameter<DoubleValue>("StdDev (noise)", "Known error in y values. Used only be Reinsch Smoothing Splines", new DoubleValue(0.1))); 76 80 Problem = new RegressionProblem(); 77 81 } … … 145 149 break; 146 150 } 151 case "Smoothing Spline (Reinsch with automatic tolerance determination using CV)": { 152 var model = CalculateSmoothingSplineReinschWithAutomaticTolerance(x, y, inputVars, Problem.ProblemData.TargetVariable); 153 Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone()))); 154 break; 155 } 156 case "B-Spline Smoothing": { 157 BSplineSmoothing(x, y, inputVars); 158 break; 159 } 147 160 default: throw new NotSupportedException(); 148 161 } … … 151 164 } 152 165 153 // array with non-zero lower index 154 private class MyArray<T> { 155 private T[] arr; 156 private int lowerBound; 157 158 public T this[int key] { 159 get { 160 return arr[key - lowerBound]; 161 } 162 set { 163 arr[key - lowerBound] = value; 164 } 165 } 166 167 public MyArray(int lowerBound, int numElements) { 168 this.lowerBound = lowerBound; 169 arr = new T[numElements]; 170 } 171 public MyArray(int lowerBound, T[] source) : this(lowerBound, source.Length) { 172 Array.Copy(source, arr, source.Length); 173 } 174 175 public T[] ToArray() { 176 var res = new T[arr.Length]; 177 Array.Copy(arr, res, res.Length); 178 return res; 179 } 166 public void BSplineSmoothing(double[] x, double[] y, string[] inputVars) { 167 Array.Sort(x, y); 168 169 CubicSpline2(x, y, inputVars); 170 171 // // see Elements of Statistical Learning Appendix: Computations for Splines 172 // int M = 4; // order of spline (cubic) 173 // int K = x.Length; 174 // int n = x.Length; 175 // 176 // double[] t = new double[K + 2 * M]; 177 // for (int i = 0; i < M; i++) t[i] = x[0]; 178 // for (int i = K; i < K + 2 * M; i++) t[i] = x[K - 1]; 179 // 180 // double[,] B = new double[n, n + 4]; // basis function matrix 181 // double[,] W = new double[n + 4, n + 4]; // penalty matrix (W[j,k] = 182 // for (int i = 0; i < M; i++) B[i] = new double[K + 2 * M]; 183 // for (int j = 0; j < K; j++) { 184 // for (int i = 0; i < K + 2M - 1; i++) { 185 // if (t[i] <= x[j] && x[j] < t[i + 1]) 186 // B[1][i] = 1.0; 187 // } 188 // } 189 190 191 } 192 193 private struct SplineData { 194 public double a, b, c, d, x, y; 195 }; 196 197 198 /* 199 * The procedure Quincunx, takes as arguments 200 the vectors u, v and w which are respectively the diagonal, the first supradiagonal 201 and the second supradiagonal of the banded matrix. The vector on 202 the LHS of the equation (80) is placed in q which contains the solution on the 203 completion of the procedure. 204 */ 205 private void Quincunx(int n, MyArray<double> u, MyArray<double> v, MyArray<double> w, MyArray<double> q) { 206 // { factorisation} 207 u[-1] = 0; 208 u[0] = 0; 209 for (int j = 1; j <= n - 1; j++) { 210 u[j] = u[j] - u[j - 2] * Math.Pow(w[j - 2], 2) - u[j - 1] * Math.Pow(v[j - 1], 2); 211 v[j] = (v[j] - u[j - 1] * v[j - 1] * w[j - 1]) / u[j]; 212 w[j] = w[j] / u[j]; 213 } 214 // { forward substitution} 215 for (int j = 1; j <= n - 1; j++) { 216 q[j] = q[j] - v[j - 1] * q[j - 1] - w[j - 2] * q[j - 2]; 217 } 218 for (int j = 1; j < n - 1; j++) { 219 q[j] = q[j] / u[j]; 220 } 221 // { back substitution} 222 q[n + 1] = 0; 223 q[n] = 0; 224 for (int j = n - 1; j >= 1; j--) { 225 q[j] = q[j] - v[j] * q[j + 1] - w[j] * q[j + 2]; 226 } 227 } 228 229 public void CubicSpline2(double[] x, double[] y, string[] inputVars) { 230 // see Pollock: Smoothing Splines 231 int n = x.Length; 232 double lambda = 1.0; 233 double[] sigma = Enumerable.Repeat(1.0, n).ToArray(); 234 SplineData[] S = new SplineData[x.Length + 1]; 235 for (int i = 0; i < x.Length; i++) { 236 S[i].x = x[i]; 237 S[i].y = y[i]; 238 } 239 S[x.Length].x = S[x.Length - 1].x; 240 S[x.Length].y = S[x.Length - 1].y; 241 242 // var 243 double[] h = new double[n], 244 r = new double[n], 245 f = new double[n], 246 p = new double[n]; 247 var q = new MyArray<double>(-1, n + 3); 248 var u = new MyArray<double>(-1, n + 3); 249 var v = new MyArray<double>(-1, n + 3); 250 var w = new MyArray<double>(-1, n + 3); 251 double mu; 252 253 mu = 2 * (1 - lambda) / (3 * lambda); 254 h[0] = S[1].x - S[0].x; 255 r[0] = 3 / h[0]; 256 for (int i = 1; i < n - 1; i++) { 257 h[i] = S[i + 1].x - S[i].x; 258 r[i] = 3 / h[i]; 259 f[i] = -(r[i - 1] + r[i]); 260 p[i] = 2 * (S[i + 1].x - S[i - 1].x); 261 q[i] = 3 * (S[i + 1].y - S[i].y) / h[i] - 3 * (S[i].y - S[i - 1].y) / h[i - 1]; 262 } 263 for (int i = 1; i < n; i++) { 264 u[i] = Math.Pow(r[i - 1], 2) * sigma[i - 1] + Math.Pow(f[i], 2) * sigma[i] + Math.Pow(r[i], 2) * sigma[i + 1]; // TODO Sigma 1..n 265 u[i] = mu * u[i] + p[i]; 266 v[i] = f[i] * r[i] * sigma[i] + r[i] * f[i + 1] * sigma[i + 1]; 267 v[i] = mu * v[i] + h[i]; 268 w[i] = mu * r[i] * r[i + 1] * sigma[i + 1]; 269 } 270 Quincunx(n, u, v, w, q); 271 // { Spline P arameters} 272 S[0].d = S[0].y - mu * r[0] * q[1] * sigma[0]; 273 S[1].d = S[1].y - mu * (f[1] * q[1] + r[1] * q[2]) * sigma[0]; 274 S[0].a = q[1] / (3 * h[0]); 275 S[0].b = 0; 276 S[0].c = (S[1].d - S[0].d) / h[0] - q[1] * h[0] / 3; 277 r[0] = 0; 278 for (int j = 1; j < n; j++) { 279 S[j].a = (q[j + 1] - q[j]) / (3 * h[j]); 280 S[j].b = q[j]; 281 S[j].c = (q[j] + q[j - 1]) * h[j - 1] + S[j - 1].c; 282 S[j].d = r[j - 1] * q[j - 1] + f[j] * q[j] + r[j] * q[j + 1]; 283 S[j].d = y[j] - mu * S[j].d * sigma[j]; 284 } 285 286 // { SmoothingSpline} 180 287 } 181 288 182 289 public void CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars) { 183 double s = ((IValueParameter<DoubleValue>)Parameters["Lambda"]).Value.Value; // controls extent of smoothing 184 185 var model = CalculateSmoothingSplineReinsch(xOrig, yOrig, inputVars, s, Problem.ProblemData.TargetVariable); 290 double s = xOrig.Length + 1; 291 double stdTol = ((IValueParameter<DoubleValue>)Parameters["StdDev (noise)"]).Value.Value; // controls extent of smoothing 292 293 var model = CalculateSmoothingSplineReinsch(xOrig, yOrig, inputVars, stdTol, s, Problem.ProblemData.TargetVariable); 186 294 187 295 Results.Add(new Result("Solution", new RegressionSolution(model, (IRegressionProblemData)Problem.ProblemData.Clone()))); … … 190 298 } 191 299 192 public static IRegressionModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double s, string targetVar) { 300 301 302 public static IRegressionModel CalculateSmoothingSplineReinschWithAutomaticTolerance(double[] xOrig, double[] yOrig, string[] inputVars, string targetVar) { 303 int n = xOrig.Length; 304 var rows = Enumerable.Range(0, n); 305 var rand = new FastRandom(1234); 306 var trainRows = rows.Shuffle(rand).Take((int)(n * 0.66)).ToArray(); 307 var testRows = rows.Except(trainRows).ToArray(); 308 309 var trainX = trainRows.Select(i => xOrig[i]).ToArray(); 310 var trainY = trainRows.Select(i => yOrig[i]).ToArray(); 311 var testX = testRows.Select(i => xOrig[i]).ToArray(); 312 var testY = testRows.Select(i => yOrig[i]).ToArray(); 313 314 var shuffledDs = new Dataset(inputVars.Concat(new string[] { targetVar }), new[] { trainX.Concat(testX).ToArray(), trainY.Concat(testY).ToArray() }); 315 var shuffeledProblemData = new RegressionProblemData(shuffledDs, inputVars, targetVar); 316 shuffeledProblemData.TrainingPartition.Start = 0; 317 shuffeledProblemData.TrainingPartition.End = trainX.Length; 318 shuffeledProblemData.TestPartition.Start = trainX.Length; 319 shuffeledProblemData.TestPartition.End = shuffledDs.Rows + 1; 320 321 double curTol = trainY.StandardDeviation(); 322 double prevTestRMSE = double.PositiveInfinity; 323 double testRMSE = double.PositiveInfinity; 324 IRegressionModel prevModel = null; 325 IRegressionModel model = null; 326 do { 327 prevModel = model; 328 prevTestRMSE = testRMSE; 329 model = CalculateSmoothingSplineReinsch(trainX, trainY, inputVars, curTol, s: trainX.Length + 1, targetVar: targetVar); 330 331 var sol = model.CreateRegressionSolution(shuffeledProblemData); 332 var trainRMSE = sol.TrainingRootMeanSquaredError; 333 testRMSE = sol.TestRootMeanSquaredError; 334 curTol = Math.Max(curTol * 0.5, 1e-12 * trainY.StandardDeviation()); 335 } while (testRMSE < prevTestRMSE); 336 return prevModel; 337 } 338 339 340 public static IRegressionModel CalculateSmoothingSplineReinsch(double[] xOrig, double[] yOrig, string[] inputVars, double stdTol, double s, string targetVar) { 193 341 var minX = xOrig.Min(); 194 342 var maxX = xOrig.Max(); 195 343 var range = maxX - minX; 196 344 197 SortAndBin(xOrig, yOrig, out xOrig, out yOrig, scaling: true); 345 double[] w = Enumerable.Repeat(stdTol, xOrig.Length).ToArray(); 346 SortAndBin((double[])xOrig.Clone(), (double[])yOrig.Clone(), w, out xOrig, out yOrig, out w, scaling: false); 198 347 199 348 // See Smoothing by Spline Functions, Reinsch, 1967 … … 202 351 var x = new MyArray<double>(1, xOrig); 203 352 var y = new MyArray<double>(1, yOrig); 204 var inv_dy = new MyArray<double>(1, Enumerable.Repeat(1.0, n).ToArray()); // equal weights353 var inv_dy = new MyArray<double>(1, w); 205 354 206 355 int n1 = 1; 207 int n2 = n - 1;356 int n2 = n; 208 357 209 358 // results … … 283 432 } 284 433 285 return new ReinschSmoothingSplineModel(a .ToArray(), b.ToArray(), c.ToArray(), d.ToArray(), xOrig, targetVar, inputVars, minX, 1.0 / range);434 return new ReinschSmoothingSplineModel(a, b, c, d, x, targetVar, inputVars); 286 435 } 287 436 288 437 private void CalculateSmoothingSpline(double[] x, double[] y, string[] inputVars) { 289 438 // see Smoothing and Non-Parametric Regression, Germán Rodríguez, 2001 2.3.1 290 SortAndBin(x, y, out x, out y, scaling: false); 439 double[] w = Enumerable.Repeat(1.0, x.Length).ToArray(); // weights necessary for sortAndBin but are ignored below (TODO) 440 SortAndBin(x, y, w, out x, out y, out w, scaling: false); 291 441 int n = x.Length; 292 442 … … 314 464 } 315 465 466 // WInvD = W.Cholesky().Solve(delta); 316 467 var solvResult = W.TrySolveIterative(delta, WInvD, new MlkBiCgStab()); 317 468 … … 339 490 340 491 Results.Add(new Result("Solution", new RegressionSolution(new SmoothingSplineModel(s, x, Problem.ProblemData.TargetVariable, inputVars), 341 (IRegressionProblemData)Problem.ProblemData.Clone()))); 342 } 343 344 private static void SortAndBin(double[] x, double[] y, out double[] x2, out double[] y2, bool scaling = false) { 345 // sort y by x 346 Array.Sort(x, y); 492 (IRegressionProblemData)Problem.ProblemData.Clone()))); 493 } 494 495 private static void SortAndBin(double[] x, double[] y, double[] w, out double[] x2, out double[] y2, out double[] w2, bool scaling = false) { 496 var sortedIdx = Enumerable.Range(0, x.Length).ToArray(); 497 // sort by x 498 Array.Sort(x, sortedIdx); 347 499 348 500 var xl = new List<double>(); 349 501 var yl = new List<double>(); 502 var wl = new List<double>(); 350 503 351 504 int n = x.Length; … … 360 513 double sumX = 0.0; 361 514 double sumY = 0.0; 515 double sumW = 0.0; 362 516 while (j < n && x[j] - x[i] <= binSize) { 363 517 k++; 364 518 sumX += x[j]; 365 sumY += y[j]; 519 sumY += y[sortedIdx[j]]; 520 sumW += w[sortedIdx[j]]; 366 521 j++; 367 522 } 368 523 var avgX = sumX / k; 369 524 if (scaling) avgX = (avgX - x[0]) / range; 370 xl.Add(avgX); // scale525 xl.Add(avgX); 371 526 yl.Add(sumY / k); 527 wl.Add(sumW); 372 528 i += k; 373 529 } … … 376 532 x2 = xl.ToArray(); 377 533 y2 = yl.ToArray(); 534 w2 = wl.ToArray(); 378 535 } 379 536 … … 390 547 391 548 549 // array with non-zero lower index 550 internal class MyArray<T> { 551 private T[] arr; 552 private int lowerBound; 553 554 public int Length { get { return arr.Length; } } 555 556 public T this[int key] { 557 get { 558 return arr[key - lowerBound]; 559 } 560 set { 561 arr[key - lowerBound] = value; 562 } 563 } 564 565 public MyArray(int lowerBound, int numElements) { 566 this.lowerBound = lowerBound; 567 arr = new T[numElements]; 568 } 569 public MyArray(int lowerBound, T[] source) : this(lowerBound, source.Length) { 570 Array.Copy(source, arr, source.Length); 571 } 572 573 public T[] ToArray() { 574 var res = new T[arr.Length]; 575 Array.Copy(arr, res, res.Length); 576 return res; 577 } 578 } 579 392 580 393 581 // UNFINISHED 394 publicclass ReinschSmoothingSplineModel : NamedItem, IRegressionModel {395 private double[]a;396 private double[]b;397 private double[]c;398 private double[]d;399 private double[]x;582 internal class ReinschSmoothingSplineModel : NamedItem, IRegressionModel { 583 private MyArray<double> a; 584 private MyArray<double> b; 585 private MyArray<double> c; 586 private MyArray<double> d; 587 private MyArray<double> x; 400 588 private double offset; 401 589 private double scale; … … 418 606 this.offset = orig.offset; 419 607 } 420 public ReinschSmoothingSplineModel(double[] a, double[] b, double[] c, double[] d, double[] x, string targetVar, string[] inputs, double offset, double scale) : base("SplineModel", "SplineModel") { 608 public ReinschSmoothingSplineModel( 609 MyArray<double> a, 610 MyArray<double> b, 611 MyArray<double> c, 612 MyArray<double> d, 613 MyArray<double> x, 614 string targetVar, string[] inputs, double offset = 0, double scale = 1) : base("SplineModel", "SplineModel") { 421 615 this.a = a; 422 616 this.b = b; … … 428 622 this.scale = scale; 429 623 this.offset = offset; 624 625 // extrapolate for xx > x[n2] 626 b[b.Length] = b[b.Length - 1]; 627 d[b.Length] = d[d.Length - 1]; 430 628 } 431 629 … … 441 639 int n = x.Length; 442 640 foreach (var xx in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows).Select(xi => (xi - offset) * scale)) { 443 if (xx < x[0]) yield return a[0]; 444 else if (xx >= x[n - 1]) yield return a[n - 2]; 445 else { 641 if (xx <= x[1]) { 642 double h = xx - x[1]; 643 yield return a[1] + h * (b[1] + h * (c[1] + h * d[1])); 644 } else if (xx >= x[n]) { 645 double h = xx - x[n]; 646 yield return a[n] + h * (b[n] + h * (c[n] + h * d[n])); 647 } else { 446 648 // binary search 447 int lower = 0;448 int upper = n - 1;649 int lower = 1; 650 int upper = n; 449 651 while (true) { 450 652 if (upper < lower) throw new InvalidProgramException();
Note: See TracChangeset
for help on using the changeset viewer.