using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Runtime.InteropServices; using System.Text; using System.Threading.Tasks; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Problems.DataAnalysis; namespace HeuristicLab.Algorithms.DataAnalysis.Experimental { public static class SBART { /* # A Cubic B-spline Smoothing routine. # # The algorithm minimises: # # (1/n) * sum ws(i)**2 * (ys(i)-sz(i))**2 + lambda* int ( sz"(xs) )**2 dxs # # lambda is a function of the spar which is assumed to be between # 0 and 1 # Input # n number of data points # ys(n) vector of length n containing the observations # ws(n) vector containing the weights given to each data point # xs(n) vector containing the ordinates of the observations # nk number of b-spline coefficients to be estimated # nk <= n+2 # knot(nk+4) vector of knot points defining the cubic b-spline basis. # spar penalised likelihood smoothing parameter # ispar indicator saying if spar is supplied or to be estimated # lspar, uspar lower and upper values for spar 0.,1. are good values # tol used in Golden Search routine # isetup setup indicator # icrit indicator saying which cross validation score # is to be computed # ld4 the leading dimension of abd (ie ld4=4) # ldnk the leading dimension of p2ip (not referenced) # Output # coef(nk) vector of spline coefficients # sz(n) vector of smoothed z-values # lev(n) vector of leverages # crit either ordinary of generalized CV score # ier error indicator # ier = 0 ___ everything fine # ier = 1 ___ spar too small or too big # problem in cholesky decomposition # Working arrays/matrix # xwy X'Wy # hs0,hs1,hs2,hs3 the diagonals of the X'WX matrix # sg0,sg1,sg2,sg3 the diagonals of the Gram matrix # abd(ld4,nk) [ X'WX+lambda*SIGMA] in diagonal form # p1ip(ld4,nk) inner products between columns of L inverse # p2ip(ldnk,nk) all inner products between columns of L inverse # L'L = [X'WX+lambdaSIGMA] NOT REFERENCED */ /* * sbart(xs,ys,ws,n,knot,nk, coef,sz,lev, crit,icrit,spar,ispar,lspar,uspar,tol, isetup, xwy, hs0,hs1,hs2,hs3, sg0,sg1,sg2,sg3, abd,p1ip,p2ip,ld4,ldnk,ier) */ // To build the fortran library (x64) use: // > ifort /dll /Qm64 /libs:static /winapp sbart.f interv.f bsplvb.f spbfa.f spbsl.f bvalue.f scopy.f ssort.f sdot.f saxpy.f bsplvd.f /Fesbart_x64.dll // check dumpbin /EXPORTS sbart_x64.dll // and dumpbin /IMPORTS sbart_x64.dll [DllImport("sbart_x64.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "sbart")] public static extern void sbart_x64( float[] xs, float[] ys, float[] ws, ref int n, float[] knot, ref int nk, float[] coeff, float[] sz, float[] lev, ref float crit, ref int icrit, ref float spar, ref int ispar, ref float lspar, ref float uspar, ref float tol, ref int isetup, float[] xwy, float[] hs0, float[] hs1, float[] hs2, float[] hs3, float[] sg0, float[] sg1, float[] sg2, float[] sg3, float[,] abd, float[,] p1ip, float[,] p2ip, ref int ld4, ref int ldnk, ref int ier); [DllImport("sbart_x86.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "sbart")] public static extern void sbart_x86(); [DllImport("sbart_x64.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "sknotl")] public static extern void sknotl_x64(float[] x, ref int n, float[] knot, ref int k); [DllImport("sbart_x64.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "setreg")] public static extern void setreg_x64(float[] x, float[] y, float[] w, ref int n, float[] xw, ref int nx, ref float min, ref float range, float[] knot, ref int nk); /* * calculates value at x of jderiv-th derivative of spline from b-repr. c the spline is taken to be continuous from the right, EXCEPT at the c rightmost knot, where it is taken to be continuous from the left. c c****** i n p u t ****** c t, bcoef, n, k......forms the b-representation of the spline f to c be evaluated. specifically, c t.....knot sequence, of length n+k, assumed nondecreasing. c bcoef.....b-coefficient sequence, of length n . c n.....length of bcoef and dimension of spline(k,t), c a s s u m e d positive . c k.....order of the spline . c c w a r n i n g . . . the restriction k .le. kmax (=20) is imposed c arbitrarily by the dimension statement for aj, dl, dr below, c but is n o w h e r e c h e c k e d for. c c x.....the point at which to evaluate . c jderiv.....integer giving the order of the derivative to be evaluated c a s s u m e d to be zero or positive. c c****** o u t p u t ****** c bvalue.....the value of the (jderiv)-th derivative of f at x . c c****** m e t h o d ****** c The nontrivial knot interval (t(i),t(i+1)) containing x is lo- c cated with the aid of interv . The k b-coeffs of f relevant for c this interval are then obtained from bcoef (or taken to be zero if c not explicitly available) and are then differenced jderiv times to c obtain the b-coeffs of (d**jderiv)f relevant for that interval. c Precisely, with j = jderiv, we have from x.(12) of the text that c c (d**j)f = sum ( bcoef(.,j)*b(.,k-j,t) ) c c where c / bcoef(.), , j .eq. 0 c / c bcoef(.,j) = / bcoef(.,j-1) - bcoef(.-1,j-1) c / ----------------------------- , j .gt. 0 c / (t(.+k-j) - t(.))/(k-j) c c Then, we use repeatedly the fact that c c sum ( a(.)*b(.,m,t)(x) ) = sum ( a(.,x)*b(.,m-1,t)(x) ) c with c (x - t(.))*a(.) + (t(.+m-1) - x)*a(.-1) c a(.,x) = --------------------------------------- c (x - t(.)) + (t(.+m-1) - x) c c to write (d**j)f(x) eventually as a linear combination of b-splines c of order 1 , and the coefficient for b(i,1,t)(x) must then be the c desired number (d**j)f(x). (see x.(17)-(19) of text). */ [DllImport("sbart_x64.dll", CallingConvention = CallingConvention.Cdecl, EntryPoint = "bvalue")] public static extern float bvalue(float[] t, float[] bcoeff, ref int n, ref int k, ref float x, ref int jderiv); public class SBART_Report { public double smoothingParameter; public double gcv; public double[] leverage; } public static IRegressionModel CalculateSBART(double[] x, double[] y, double[] w, int nKnots, string targetVariable, string[] inputVars, out SBART_Report rep) { // use kMeans to find knot points double[,] xy = new double[x.Length, 1]; for (int i = 0; i < x.Length; i++) xy[i, 0] = x[i]; double[,] c; int[] xyc; int info; alglib.kmeansgenerate(xy, x.Length, 1, nKnots, 10, out info, out c, out xyc); var g = x.Zip(xyc, (double xi, int ci) => Tuple.Create(xi,ci)).GroupBy(t => t.Item2).Select(gr => HeuristicLab.Common.EnumerableStatisticExtensions.Median(gr.Select(gi=>gi.Item1))).ToArray(); // find medians double[] knots = new double[nKnots]; for (int i = 0; i < g.Length; i++) knots[i] = g[i]; return CalculateSBART(x, y, w, knots, targetVariable, inputVars, out rep); } public static IRegressionModel CalculateSBART(double[] x, double[] y, double[] w, double[] knots, string targetVariable, string[] inputVars, out SBART_Report rep) { int ier = 99; int tries = 0; float tol = 0.01f; do { tries++; float[] xs = x.Select(xi => (float)xi).ToArray(); float[] ys = y.Select(xi => (float)xi).ToArray(); float[] ws = w.Select(xi => (float)xi).ToArray(); float[] k = knots.Select(xi => (float)xi).ToArray(); int n = xs.Length; if (n < 4) throw new ArgumentException("n < 4"); if (knots.Length > n + 2) throw new ArgumentException("more than n+2 knots"); float[] xw = new float[n]; int nx = -99; float min = 0.0f; float range = 0.0f; int nk = -99; float[] regKnots = new float[n + 6]; // sort xs together with ys and ws // combine rows with duplicate x values // transform x to range [0 .. 1] // create a set of knots (using a heuristic for the number of knots) // knots are located at data points. denser regions of x contain more knots. SBART.setreg_x64(xs, ys, ws, ref n, xw, ref nx, ref min, ref range, regKnots, ref nk); // in this case we want to use the knots supplied by the caller. // the knot values produced by setreg are overwritten with scaled knots supplied by caller. // knots must be ordered as well. int i = 0; // left boundary regKnots[i++] = 0.0f; regKnots[i++] = 0.0f; regKnots[i++] = 0.0f; regKnots[i++] = 0.0f; int j = 1; foreach (var knot in knots.OrderBy(ki => ki)) { regKnots[i++] = xs[j * nx / (knots.Length + 1)]; // ((float)knot - min) / range; j++; } // right boundary regKnots[i++] = 1.0f; regKnots[i++] = 1.0f; regKnots[i++] = 1.0f; regKnots[i++] = 1.0f; nk = i - 4; float criterion = -99.0f; // GCV int icrit = 1; // calculate GCV float smoothingParameter = -99.0f; int smoothingParameterIndicator = 0; float lowerSmoothingParameter = 0.0f; float upperSmoothingParameter = 1.0f; int isetup = 0; // not setup? // results float[] coeff = new float[nk]; float[] leverage = new float[nx]; float[] y_smoothed = new float[nx]; // working arrays for sbart float[] xwy = new float[nk]; float[] hs0 = new float[nk]; float[] hs1 = new float[nk]; float[] hs2 = new float[nk]; float[] hs3 = new float[nk]; float[] sg0 = new float[nk]; float[] sg1 = new float[nk]; float[] sg2 = new float[nk]; float[] sg3 = new float[nk]; int ld4 = 4; float[,] adb = new float[ld4, nk]; float[,] p1ip = new float[nk, ld4]; int ldnk = nk + 4; float[,] p2ip = new float[nk, nx]; SBART.sbart_x64(xs.Take(nx).ToArray(), ys.Take(nx).ToArray(), ws.Take(nx).ToArray(), ref nx, regKnots, ref nk, coeff, y_smoothed, leverage, ref criterion, ref icrit, ref smoothingParameter, ref smoothingParameterIndicator, ref lowerSmoothingParameter, ref upperSmoothingParameter, ref tol, ref isetup, xwy, hs0, hs1, hs2, hs3, sg0, sg1, sg2, sg3, adb, p1ip, p2ip, ref ld4, ref ldnk, ref ier); if (ier > 0) { Console.WriteLine("ERROR {0} smooth {1} criterion {2}", ier, smoothingParameter, criterion); tol *= 2; tol = Math.Min(tol, 1.0f); } else { if (tries > 1) { Console.WriteLine("Success {0} smooth {1} criterion {2}", ier, smoothingParameter, criterion); } rep = new SBART_Report(); rep.gcv = criterion; rep.smoothingParameter = smoothingParameter; rep.leverage = leverage.Select(li => (double)li).ToArray(); return new BartRegressionModel(regKnots.Take(nk + 4).ToArray(), coeff, targetVariable, inputVars, min, range); } } while (ier > 0); throw new ArgumentException(); } public static IRegressionModel CalculateSBART(double[] x, double[] y, string targetVariable, string[] inputVars, out SBART_Report report) { var w = Enumerable.Repeat(1.0, x.Length).ToArray(); int n = x.Length; int ic = n - 1; int ier = -99; int nk = n; float[] knots = new float[nk + 6]; float crit = -99.0f; int icrit = 1; // 0..don't calc CV, 1 .. GCV, 2 CV float smoothingParameter = -99.0f; int smoothingParameterIndicator = 0; float lowerSmoothingParameter = 0f; float upperSmoothingParameter = 1.0f; float tol = 0.02f; int isetup = 0; // not setup? float min = -99.0f; float range = -99.0f; if (Environment.Is64BitProcess) { float[] xw = new float[n]; int nx = -99; float[] xs = x.Select(xi => (float)xi).ToArray(); float[] ys = y.Select(yi => (float)yi).ToArray(); float[] ws = w.Select(wi => (float)wi).ToArray(); // sort xs together with ys and ws // combine rows with duplicate x values // create a set of knots (using a heuristic for the number of knots) // knots are located at data points. denser regions of x contain more knots. SBART.setreg_x64(xs, ys, ws, ref n, xw, ref nx, ref min, ref range, knots, ref nk); /* use all points as knot points nk = nx + 2; knots[0] = xs[0]; knots[1] = xs[0]; knots[2] = xs[0]; Array.Copy(xs, 0, knots, 3, nx); knots[nx + 3] = xs[nx - 1]; knots[nx + 4] = xs[nx - 1]; knots[nx + 5] = xs[nx - 1]; */ /* // use uniform grid of knots nk = 20; knots = new float[nk + 4]; knots[0] = xs[0]; knots[1] = xs[0]; knots[2] = xs[0]; for(int i = 3; i 0) throw new ArgumentException(ier.ToString()); report = new SBART_Report(); report.gcv = crit; report.smoothingParameter = smoothingParameter; report.leverage = leverage.Select(li => (double)li).ToArray(); return new BartRegressionModel(knots.Take(nk+4).ToArray(), coeff, targetVariable, inputVars, min, range); } else { throw new NotSupportedException(); } } public class BartRegressionModel : NamedItem, IRegressionModel { private float[] knots; private float[] bcoeff; private double min; private double range; public string TargetVariable { get; set; } private string[] variablesUsedForPrediction; public IEnumerable VariablesUsedForPrediction { get { return variablesUsedForPrediction; } } public BartRegressionModel(BartRegressionModel orig, Cloner cloner) { this.knots = orig.knots; this.bcoeff = orig.bcoeff; this.min = orig.min; this.range = orig.range; this.TargetVariable = orig.TargetVariable; this.variablesUsedForPrediction = orig.variablesUsedForPrediction; } public BartRegressionModel(float[] knots, float[] bcoeff, string targetVariable, string[] inputVars, double min, double range) { this.variablesUsedForPrediction = inputVars; this.TargetVariable = targetVariable; this.knots = knots; this.bcoeff = bcoeff; this.range = range; this.min = min; } public event EventHandler TargetVariableChanged; public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) { return new RegressionSolution(this, (IRegressionProblemData) problemData.Clone()); } public double GetEstimatedValue(double xx) { float x = (float)((xx - min) / range); int n = bcoeff.Length; int k = 4; int zero = 0; int one = 1; int two = 2; // linear extrapolation if (x < 0) { float x0 = 0.0f; var y0 = bvalue(knots, bcoeff, ref n, ref k, ref x0, ref zero); var y0d = bvalue(knots, bcoeff, ref n, ref k, ref x0, ref one); return y0 + x * y0d; } if (x > 1) { float x1 = 1.0f; var y1 = bvalue(knots, bcoeff, ref n, ref k, ref x1, ref zero); var y1d = bvalue(knots, bcoeff, ref n, ref k, ref x1, ref one); return y1 + (x-1) * y1d; } lock (this) { return bvalue(knots, bcoeff, ref n, ref k, ref x, ref zero); } // piecewise constant approximation // if (xx <= x[0]) return bcoeff[0]; // if (xx >= x[n - 1]) return bcoeff[n - 1]; // for(int i=1;i 0 && h2 <= 0) { // if (h1 < h2) return bcoeff[i]; else return bcoeff[i + 1]; // } // } // return 0.0; // // piecewise linear approximation // int n = x.Length; // if (xx <= x[0]) { // double h = xx - x[0]; // return h * (y[1] - y[0]) / (x[1] - x[0]) + y[0]; // } else if (xx >= x[n-1]) { // double h = xx - x[n-1]; // return h * (y[n-1] - y[n-2]) / (x[n-1] - x[n-2]) + y[n-1]; // } else { // // binary search // int lower = 0; // int upper = n-1; // while (true) { // if (upper < lower) throw new InvalidProgramException(); // int i = lower + (upper - lower) / 2; // if (x[i] <= xx && xx < x[i + 1]) { // double h = xx - x[i]; // double k = (y[i + 1] - y[i]) / (x[i + 1] - x[i]); // return h * k + y[i]; // } else if (xx < x[i]) { // upper = i - 1; // } else { // lower = i + 1; // } // } // } // return 0.0; } public IEnumerable GetEstimatedValues(IDataset dataset, IEnumerable rows) { foreach(var x in dataset.GetDoubleValues(VariablesUsedForPrediction.First(), rows)) { yield return GetEstimatedValue(x); } } public override IDeepCloneable Clone(Cloner cloner) { return new BartRegressionModel(this, cloner); } } } }