1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.Diagnostics;
|
---|
4 | using System.Linq;
|
---|
5 | using System.Runtime.InteropServices;
|
---|
6 | using System.Text;
|
---|
7 | using System.Threading.Tasks;
|
---|
8 | using HeuristicLab.Problems.DataAnalysis;
|
---|
9 |
|
---|
10 | namespace HeuristicLab.Algorithms.DataAnalysis.Experimental {
|
---|
11 | public static class CubicSplineGCV {
|
---|
12 | // CUBGCV (X,F,DF,N,Y,C,IC,VAR,JOB,SE,WK,IER)
|
---|
13 | // ARGUMENTS X - VECTOR OF LENGTH N CONTAINING THE
|
---|
14 | // ABSCISSAE OF THE N DATA POINTS
|
---|
15 | // (X(I),F(I)) I=1..N. (INPUT) X
|
---|
16 | // MUST BE ORDERED SO THAT
|
---|
17 | // X(I) .LT. X(I+1).
|
---|
18 | // F - VECTOR OF LENGTH N CONTAINING THE
|
---|
19 | // ORDINATES (OR FUNCTION VALUES)
|
---|
20 | // OF THE N DATA POINTS (INPUT).
|
---|
21 | // DF - VECTOR OF LENGTH N. (INPUT/OUTPUT)
|
---|
22 | // DF(I) IS THE RELATIVE STANDARD DEVIATION
|
---|
23 | // OF THE ERROR ASSOCIATED WITH DATA POINT I.
|
---|
24 | // EACH DF(I) MUST BE POSITIVE. THE VALUES IN
|
---|
25 | // DF ARE SCALED BY THE SUBROUTINE SO THAT
|
---|
26 | // THEIR MEAN SQUARE VALUE IS 1, AND UNSCALED
|
---|
27 | // AGAIN ON NORMAL EXIT.
|
---|
28 | // THE MEAN SQUARE VALUE OF THE DF(I) IS RETURNED
|
---|
29 | // IN WK(7) ON NORMAL EXIT.
|
---|
30 | // IF THE ABSOLUTE STANDARD DEVIATIONS ARE KNOWN,
|
---|
31 | // THESE SHOULD BE PROVIDED IN DF AND THE ERROR
|
---|
32 | // VARIANCE PARAMETER VAR (SEE BELOW) SHOULD THEN
|
---|
33 | // BE SET TO 1.
|
---|
34 | // IF THE RELATIVE STANDARD DEVIATIONS ARE UNKNOWN,
|
---|
35 | // SET EACH DF(I)=1.
|
---|
36 | // N - NUMBER OF DATA POINTS (INPUT).
|
---|
37 | // N MUST BE .GE. 3.
|
---|
38 | // Y,C - SPLINE COEFFICIENTS. (OUTPUT) Y
|
---|
39 | // IS A VECTOR OF LENGTH N. C IS
|
---|
40 | // AN N-1 BY 3 MATRIX. THE VALUE
|
---|
41 | // OF THE SPLINE APPROXIMATION AT T IS
|
---|
42 | // S(T)=((C(I,3)*D+C(I,2))*D+C(I,1))*D+Y(I)
|
---|
43 | // WHERE X(I).LE.T.LT.X(I+1) AND
|
---|
44 | // D = T-X(I).
|
---|
45 | // IC - ROW DIMENSION OF MATRIX C EXACTLY
|
---|
46 | // AS SPECIFIED IN THE DIMENSION
|
---|
47 | // STATEMENT IN THE CALLING PROGRAM. (INPUT)
|
---|
48 | // VAR - ERROR VARIANCE. (INPUT/OUTPUT)
|
---|
49 | // IF VAR IS NEGATIVE (I.E. UNKNOWN) THEN
|
---|
50 | // THE SMOOTHING PARAMETER IS DETERMINED
|
---|
51 | // BY MINIMIZING THE GENERALIZED CROSS VALIDATION
|
---|
52 | // AND AN ESTIMATE OF THE ERROR VARIANCE IS
|
---|
53 | // RETURNED IN VAR.
|
---|
54 | // IF VAR IS NON-NEGATIVE (I.E. KNOWN) THEN THE
|
---|
55 | // SMOOTHING PARAMETER IS DETERMINED TO MINIMIZE
|
---|
56 | // AN ESTIMATE, WHICH DEPENDS ON VAR, OF THE TRUE
|
---|
57 | // MEAN SQUARE ERROR, AND VAR IS UNCHANGED.
|
---|
58 | // IN PARTICULAR, IF VAR IS ZERO, THEN AN
|
---|
59 | // INTERPOLATING NATURAL CUBIC SPLINE IS CALCULATED.
|
---|
60 | // VAR SHOULD BE SET TO 1 IF ABSOLUTE STANDARD
|
---|
61 | // DEVIATIONS HAVE BEEN PROVIDED IN DF (SEE ABOVE).
|
---|
62 | // JOB - JOB SELECTION PARAMETER. (INPUT)
|
---|
63 | // JOB = 0 SHOULD BE SELECTED IF POINT STANDARD ERROR
|
---|
64 | // ESTIMATES ARE NOT REQUIRED IN SE.
|
---|
65 | // JOB = 1 SHOULD BE SELECTED IF POINT STANDARD ERROR
|
---|
66 | // ESTIMATES ARE REQUIRED IN SE.
|
---|
67 | // SE - VECTOR OF LENGTH N CONTAINING BAYESIAN STANDARD
|
---|
68 | // ERROR ESTIMATES OF THE FITTED SPLINE VALUES IN Y.
|
---|
69 | // SE IS NOT REFERENCED IF JOB=0. (OUTPUT)
|
---|
70 | // WK - WORK VECTOR OF LENGTH 7*(N + 2). ON NORMAL EXIT THE
|
---|
71 | // FIRST 7 VALUES OF WK ARE ASSIGNED AS FOLLOWS:-
|
---|
72 | //
|
---|
73 | // WK(1) = SMOOTHING PARAMETER (= RHO/(RHO + 1))
|
---|
74 | // WK(2) = ESTIMATE OF THE NUMBER OF DEGREES OF
|
---|
75 | // FREEDOM OF THE RESIDUAL SUM OF SQUARES
|
---|
76 | // WK(3) = GENERALIZED CROSS VALIDATION
|
---|
77 | // WK(4) = MEAN SQUARE RESIDUAL
|
---|
78 | // WK(5) = ESTIMATE OF THE TRUE MEAN SQUARE ERROR
|
---|
79 | // AT THE DATA POINTS
|
---|
80 | // WK(6) = ESTIMATE OF THE ERROR VARIANCE
|
---|
81 | // WK(7) = MEAN SQUARE VALUE OF THE DF(I)
|
---|
82 | //
|
---|
83 | // IF WK(1)=0 (RHO=0) AN INTERPOLATING NATURAL CUBIC
|
---|
84 | // SPLINE HAS BEEN CALCULATED.
|
---|
85 | // IF WK(1)=1 (RHO=INFINITE) A LEAST SQUARES
|
---|
86 | // REGRESSION LINE HAS BEEN CALCULATED.
|
---|
87 | // WK(2) IS AN ESTIMATE OF THE NUMBER OF DEGREES OF
|
---|
88 | // FREEDOM OF THE RESIDUAL WHICH REDUCES TO THE
|
---|
89 | // USUAL VALUE OF N-2 WHEN A LEAST SQUARES REGRESSION
|
---|
90 | // LINE IS CALCULATED.
|
---|
91 | // WK(3),WK(4),WK(5) ARE CALCULATED WITH THE DF(I)
|
---|
92 | // SCALED TO HAVE MEAN SQUARE VALUE 1. THE
|
---|
93 | // UNSCALED VALUES OF WK(3),WK(4),WK(5) MAY BE
|
---|
94 | // CALCULATED BY DIVIDING BY WK(7).
|
---|
95 | // WK(6) COINCIDES WITH THE OUTPUT VALUE OF VAR IF
|
---|
96 | // VAR IS NEGATIVE ON INPUT. IT IS CALCULATED WITH
|
---|
97 | // THE UNSCALED VALUES OF THE DF(I) TO FACILITATE
|
---|
98 | // COMPARISONS WITH A PRIORI VARIANCE ESTIMATES.
|
---|
99 | //
|
---|
100 | // IER - ERROR PARAMETER. (OUTPUT)
|
---|
101 | // TERMINAL ERROR
|
---|
102 | // IER = 129, IC IS LESS THAN N-1.
|
---|
103 | // IER = 130, N IS LESS THAN 3.
|
---|
104 | // IER = 131, INPUT ABSCISSAE ARE NOT
|
---|
105 | // ORDERED SO THAT X(I).LT.X(I+1).
|
---|
106 | // IER = 132, DF(I) IS NOT POSITIVE FOR SOME I.
|
---|
107 | // IER = 133, JOB IS NOT 0 OR 1.
|
---|
108 |
|
---|
109 | // To build the fortran library (x64) use:
|
---|
110 | // > gfortran -m64 642.f -shared -o 642_x64.dll or
|
---|
111 | // > ifort /dll /Qm64 642.f /Fe642_x64.dll
|
---|
112 | // check dumpbin /EXPORTS 642_x64.dll
|
---|
113 | // and dumpbin /IMPORTS 642_x64.dll
|
---|
114 | [DllImport("642_x64.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "cubgcv")]
|
---|
115 | public static extern void cubgcv_x64(
|
---|
116 | double[] x,
|
---|
117 | double[] f,
|
---|
118 | double[] df,
|
---|
119 | ref int n,
|
---|
120 | [Out] double[] y,
|
---|
121 | [Out] double[,] c,
|
---|
122 | ref int ic,
|
---|
123 | ref double var,
|
---|
124 | ref int job,
|
---|
125 | [Out] double[] se,
|
---|
126 | double[,] wk,
|
---|
127 | ref int ier);
|
---|
128 |
|
---|
129 | // To build the fortran library (x86) use:
|
---|
130 | // > gfortran -static -Lc:/mingw/bin/libgcc_s_dw2-1.dll -m32 642.f -shared -o 642_x86.dll
|
---|
131 | [DllImport("642_x86.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "cubgcv")]
|
---|
132 | public static extern void cubgcv_x86(
|
---|
133 | double[] x,
|
---|
134 | double[] f,
|
---|
135 | double[] df,
|
---|
136 | ref Int32 n,
|
---|
137 | double[] y,
|
---|
138 | double[,] c,
|
---|
139 | ref Int32 ic,
|
---|
140 | ref double var,
|
---|
141 | ref Int32 job,
|
---|
142 | double[] se,
|
---|
143 | double[,] wk,
|
---|
144 | ref Int32 ier);
|
---|
145 |
|
---|
146 |
|
---|
147 | public class CubGcvReport {
|
---|
148 | //
|
---|
149 | //
|
---|
150 | //
|
---|
151 | //
|
---|
152 | // IF WK(1)=0 (RHO=0) AN INTERPOLATING NATURAL CUBIC
|
---|
153 | // SPLINE HAS BEEN CALCULATED.
|
---|
154 | // IF WK(1)=1 (RHO=INFINITE) A LEAST SQUARES
|
---|
155 | // REGRESSION LINE HAS BEEN CALCULATED.
|
---|
156 | //
|
---|
157 | public double smoothingParameter;
|
---|
158 | // WK(2) IS AN ESTIMATE OF THE NUMBER OF DEGREES OF
|
---|
159 | // FREEDOM OF THE RESIDUAL WHICH REDUCES TO THE
|
---|
160 | // USUAL VALUE OF N-2 WHEN A LEAST SQUARES REGRESSION
|
---|
161 | // LINE IS CALCULATED.
|
---|
162 | public double estimatedRSSDegreesOfFreedom;
|
---|
163 | // WK(3),WK(4),WK(5) ARE CALCULATED WITH THE DF(I)
|
---|
164 | // SCALED TO HAVE MEAN SQUARE VALUE 1. THE
|
---|
165 | // UNSCALED VALUES OF WK(3),WK(4),WK(5) MAY BE
|
---|
166 | // CALCULATED BY DIVIDING BY WK(7).
|
---|
167 | public double generalizedCrossValidation;
|
---|
168 | public double meanSquareResiudal;
|
---|
169 | public double estimatedTrueMeanSquaredErrorAtDataPoints;
|
---|
170 |
|
---|
171 | // WK(6) COINCIDES WITH THE OUTPUT VALUE OF VAR IF
|
---|
172 | // VAR IS NEGATIVE ON INPUT. IT IS CALCULATED WITH
|
---|
173 | // THE UNSCALED VALUES OF THE DF(I) TO FACILITATE
|
---|
174 | // COMPARISONS WITH A PRIORI VARIANCE ESTIMATES.
|
---|
175 | public double estimatedErrorVariance;
|
---|
176 | public double meanSquareOfDf;
|
---|
177 | public double[] se;
|
---|
178 | }
|
---|
179 | public static ReinschSmoothingSplineModel CalculateCubicSpline(double[] x, double[] y,
|
---|
180 | string targetVariable, string[] inputVars,
|
---|
181 | out CubGcvReport report) {
|
---|
182 | var w = Enumerable.Repeat(1.0, x.Length).ToArray();
|
---|
183 | double[] x2, y2, w2;
|
---|
184 | Splines.SortAndBin(x, y, w, out x2, out y2, out w2);
|
---|
185 | // TODO: use weights correctly
|
---|
186 |
|
---|
187 | int n = x2.Length;
|
---|
188 | double[] df = Enumerable.Repeat(1.0, n).ToArray(); // set each df if actual standard deviation of data points is not known
|
---|
189 | double[,] c = new double[3, n - 1];
|
---|
190 | double[] se = new double[n]; // standard errors in points
|
---|
191 | double[,] wk = new double[7, (n + 2)]; // work array;
|
---|
192 | double[] y_smoothed = new double[n];
|
---|
193 | int job = 1; // calc estimates of standard errors in points
|
---|
194 | int ic = n - 1;
|
---|
195 | int ier = -99;
|
---|
196 | double var = -99.0;
|
---|
197 | if (Environment.Is64BitProcess) {
|
---|
198 | CubicSplineGCV.cubgcv_x64(x2, y2, df, ref n, y_smoothed,
|
---|
199 | c, ref ic, ref var, ref job, se, wk, ref ier);
|
---|
200 | } else {
|
---|
201 | CubicSplineGCV.cubgcv_x86(x2, y2, df, ref n, y_smoothed,
|
---|
202 | c, ref ic, ref var, ref job, se, wk, ref ier);
|
---|
203 |
|
---|
204 |
|
---|
205 | }
|
---|
206 |
|
---|
207 | /*
|
---|
208 | * IER - ERROR PARAMETER. (OUTPUT)
|
---|
209 | * TERMINAL ERROR
|
---|
210 | * IER = 129, IC IS LESS THAN N-1.
|
---|
211 | * IER = 130, N IS LESS THAN 3.
|
---|
212 | * IER = 131, INPUT ABSCISSAE ARE NOT
|
---|
213 | * ORDERED SO THAT X(I).LT.X(I+1).
|
---|
214 | * IER = 132, DF(I) IS NOT POSITIVE FOR SOME I.
|
---|
215 | * IER = 133, JOB IS NOT 0 OR 1.
|
---|
216 | *
|
---|
217 | */
|
---|
218 |
|
---|
219 | if (ier == 131) {
|
---|
220 | throw new ArgumentException("x is not ordered");
|
---|
221 | } else if (ier != 0) throw new ArgumentException("Error in CUBGCV " + ier);
|
---|
222 |
|
---|
223 | // OF THE SPLINE APPROXIMATION AT T IS
|
---|
224 | // S(T)=((C(I,3)*D+C(I,2))*D+C(I,1))*D+Y(I)
|
---|
225 | // WHERE X(I).LE.T.LT.X(I+1) AND
|
---|
226 | // D = T-X(I).
|
---|
227 |
|
---|
228 | // WK(1) = SMOOTHING PARAMETER (= RHO/(RHO + 1))
|
---|
229 | // WK(2) = ESTIMATE OF THE NUMBER OF DEGREES OF
|
---|
230 | // FREEDOM OF THE RESIDUAL SUM OF SQUARES
|
---|
231 | // WK(3) = GENERALIZED CROSS VALIDATION
|
---|
232 | // WK(4) = MEAN SQUARE RESIDUAL
|
---|
233 | // WK(5) = ESTIMATE OF THE TRUE MEAN SQUARE ERROR
|
---|
234 | // AT THE DATA POINTS
|
---|
235 | // WK(6) = ESTIMATE OF THE ERROR VARIANCE
|
---|
236 | // WK(7) = MEAN SQUARE VALUE OF THE DF(I)
|
---|
237 | report = new CubGcvReport();
|
---|
238 | report.smoothingParameter = wk[0, 0];
|
---|
239 | report.estimatedRSSDegreesOfFreedom = wk[0, 1];
|
---|
240 | report.generalizedCrossValidation = wk[0, 2];
|
---|
241 | report.meanSquareResiudal = wk[0, 3];
|
---|
242 | report.estimatedTrueMeanSquaredErrorAtDataPoints = wk[0, 4];
|
---|
243 | report.estimatedErrorVariance = wk[0, 5];
|
---|
244 | report.meanSquareOfDf = wk[0, 6];
|
---|
245 | report.se = se;
|
---|
246 | // Debug.Assert(var == report.estimatedErrorVariance);
|
---|
247 |
|
---|
248 |
|
---|
249 |
|
---|
250 | double[] dArr = new double[n];
|
---|
251 | double[] cArr = new double[n];
|
---|
252 | double[] bArr = new double[n];
|
---|
253 |
|
---|
254 | for (int i = 0; i < n - 1; i++) {
|
---|
255 | bArr[i] = c[0, i];
|
---|
256 | cArr[i] = c[1, i];
|
---|
257 | dArr[i] = c[2, i];
|
---|
258 | }
|
---|
259 |
|
---|
260 | // extrapolate linearly for xx > x[n]
|
---|
261 | bArr[n - 1] = bArr[n - 2];
|
---|
262 | // extrapolate linearly for xx < x[1]
|
---|
263 | dArr[0] = 0;
|
---|
264 |
|
---|
265 | return new ReinschSmoothingSplineModel(
|
---|
266 | new MyArray<double>(1, y_smoothed),
|
---|
267 | new MyArray<double>(1, bArr),
|
---|
268 | new MyArray<double>(1, cArr),
|
---|
269 | new MyArray<double>(1, dArr),
|
---|
270 | new MyArray<double>(1, x2),
|
---|
271 | targetVariable, inputVars);
|
---|
272 |
|
---|
273 |
|
---|
274 | }
|
---|
275 |
|
---|
276 | }
|
---|
277 | }
|
---|