/*************************************************************************
Copyright (c) Sergey Bochkanov (ALGLIB project).

>>> SOURCE LICENSE >>>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (www.fsf.org); either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

A copy of the GNU General Public License is available at
http://www.fsf.org/licensing/licenses
>>> END OF LICENSE >>>
*************************************************************************/
#pragma warning disable 162
#pragma warning disable 219
using System;

public partial class alglib
{
    /*************************************************************************
    *************************************************************************/
    public class decisionforest
    {
        //
        // Public declarations
        //
        public decisionforest()
        {
            _innerobj = new dforest.decisionforest();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them directly: they are intended for internal use only.
        //
        private dforest.decisionforest _innerobj;
        public dforest.decisionforest innerobj { get { return _innerobj; } }
        public decisionforest(dforest.decisionforest obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    *************************************************************************/
    public class dfreport
    {
        //
        // Public declarations
        //
        public double relclserror { get { return _innerobj.relclserror; } set { _innerobj.relclserror = value; } }
        public double avgce { get { return _innerobj.avgce; } set { _innerobj.avgce = value; } }
        public double rmserror { get { return _innerobj.rmserror; } set { _innerobj.rmserror = value; } }
        public double avgerror { get { return _innerobj.avgerror; } set { _innerobj.avgerror = value; } }
        public double avgrelerror { get { return _innerobj.avgrelerror; } set { _innerobj.avgrelerror = value; } }
        public double oobrelclserror { get { return _innerobj.oobrelclserror; } set { _innerobj.oobrelclserror = value; } }
        public double oobavgce { get { return _innerobj.oobavgce; } set { _innerobj.oobavgce = value; } }
        public double oobrmserror { get { return _innerobj.oobrmserror; } set { _innerobj.oobrmserror = value; } }
        public double oobavgerror { get { return _innerobj.oobavgerror; } set { _innerobj.oobavgerror = value; } }
        public double oobavgrelerror { get { return _innerobj.oobavgrelerror; } set { _innerobj.oobavgrelerror = value; } }

        public dfreport()
        {
            _innerobj = new dforest.dfreport();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them directly: they are intended for internal use only.
        //
        private dforest.dfreport _innerobj;
        public dforest.dfreport innerobj { get { return _innerobj; } }
        public dfreport(dforest.dfreport obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    This subroutine builds a random decision forest.

    INPUT PARAMETERS:
        XY          -   training set
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   task type:
                        * NClasses=1 - regression task with one dependent variable
                        * NClasses>1 - classification task with NClasses classes.
        NTrees      -   number of trees in a forest, NTrees>=1.
                        recommended values: 50-100.
        R           -   fraction of the training set used to build individual
                        trees, 0<R<=1; recommended values: 0.1 <= R <= 0.66.

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<1, NVars<1, NClasses<1, NTrees<1,
                              R<=0 or R>1).
                        *  1, if task has been solved
        DF          -   model built
        Rep         -   training report, contains error on a training set
                        and out-of-bag estimates of generalization error.

      -- ALGLIB --
         Copyright 19.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep)
    {
        info = 0;
        df = new decisionforest();
        rep = new dfreport();
        dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj);
        return;
    }

    /*************************************************************************
    Processing

    INPUT PARAMETERS:
        DF          -   decision forest model
        X           -   input vector, array[0..NVars-1].

    OUTPUT PARAMETERS:
        Y           -   result. Regression estimate when solving a regression
                        task, vector of posterior probabilities for a
                        classification task.

    See also DFProcessI.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocess(decisionforest df, double[] x, ref double[] y)
    {
        dforest.dfprocess(df.innerobj, x, ref y);
        return;
    }

    /*************************************************************************
    'Interactive' variant of DFProcess for languages like Python which
    support constructs like "Y = DFProcessI(DF,X)" and an interactive
    interpreter mode.

    This function allocates a new array on each call, so it is significantly
    slower than its 'non-interactive' counterpart, but it is more convenient
    when you call it from the command line.

      -- ALGLIB --
         Copyright 28.02.2010 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocessi(decisionforest df, double[] x, out double[] y)
    {
        y = new double[0];
        dforest.dfprocessi(df.innerobj, x, ref y);
        return;
    }

    /*************************************************************************
    Relative classification error on the test set

    INPUT PARAMETERS:
        DF          -   decision forest model
        XY          -   test set
        NPoints     -   test set size

    RESULT:
        percent of incorrectly classified cases.
        Zero if the model solves a regression task.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfrelclserror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfrelclserror(df.innerobj, xy, npoints);
        return result;
    }

    /*************************************************************************
    Average cross-entropy (in bits per element) on the test set

    INPUT PARAMETERS:
        DF          -   decision forest model
        XY          -   test set
        NPoints     -   test set size

    RESULT:
        CrossEntropy/(NPoints*LN(2)).
        Zero if the model solves a regression task.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgce(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgce(df.innerobj, xy, npoints);
        return result;
    }
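    //
    // Usage sketch for the forest functions above (illustrative only, not part
    // of the ALGLIB API). It trains a small random decision forest on a
    // two-class toy dataset and classifies one new point. The helper name
    // dfusagesketch and all data values below are made up for demonstration.
    //
    private static void dfusagesketch()
    {
        // Toy training set: two features per row, last column is the class (0 or 1).
        double[,] xy = new double[,]{
            {0.1, 0.2, 0},
            {0.2, 0.1, 0},
            {0.8, 0.9, 1},
            {0.9, 0.8, 1}
        };
        int info;
        decisionforest df;
        dfreport rep;

        // 4 points, 2 variables, 2 classes, 50 trees, 66% of the set per tree.
        dfbuildrandomdecisionforest(xy, 4, 2, 2, 50, 0.66, out info, out df, out rep);

        // Classify a new point: y receives posterior probabilities for each class.
        double[] y;
        dfprocessi(df, new double[]{0.15, 0.15}, out y);
    }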
    /*************************************************************************
    RMS error on the test set

    INPUT PARAMETERS:
        DF          -   decision forest model
        XY          -   test set
        NPoints     -   test set size

    RESULT:
        root mean square error.
        Its meaning for a regression task is obvious. For a classification
        task, RMS error is the error made when estimating posterior
        probabilities.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfrmserror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfrmserror(df.innerobj, xy, npoints);
        return result;
    }

    /*************************************************************************
    Average error on the test set

    INPUT PARAMETERS:
        DF          -   decision forest model
        XY          -   test set
        NPoints     -   test set size

    RESULT:
        Its meaning for a regression task is obvious. For a classification
        task, it is the average error made when estimating posterior
        probabilities.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgerror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgerror(df.innerobj, xy, npoints);
        return result;
    }

    /*************************************************************************
    Average relative error on the test set

    INPUT PARAMETERS:
        DF          -   decision forest model
        XY          -   test set
        NPoints     -   test set size

    RESULT:
        Its meaning for a regression task is obvious. For a classification
        task, it is the average relative error made when estimating the
        posterior probability of belonging to the correct class.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgrelerror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgrelerror(df.innerobj, xy, npoints);
        return result;
    }
}

public partial class alglib
{
    /*************************************************************************
    Optimal binary classification

    The algorithm finds the optimal (i.e. with minimal cross-entropy) binary
    partition. Internal subroutine.

    INPUT PARAMETERS:
        A           -   array[0..N-1], variable
        C           -   array[0..N-1], class numbers (0 or 1).
        N           -   array size

    OUTPUT PARAMETERS:
        Info        -   completion code:
                        * -3, all values of A[] are the same (partition is
                              impossible)
                        * -2, one of C[] is incorrect (<0, >1)
                        * -1, incorrect parameters were passed (N<=0).
                        *  1, OK
        Threshold   -   partition boundary. The left part contains values
                        which are strictly less than Threshold, the right
                        part contains values which are greater than or equal
                        to Threshold.
        PAL, PBL    -   probabilities P(0|v<Threshold) and P(1|v<Threshold)
        PAR, PBR    -   probabilities P(0|v>=Threshold) and P(1|v>=Threshold)
        CVE         -   cross-validation estimate of cross-entropy

      -- ALGLIB --
         Copyright 22.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void dsoptimalsplit2(double[] a, int[] c, int n, out int info, out double threshold, out double pal, out double pbl, out double par, out double pbr, out double cve)
    {
        info = 0;
        threshold = 0;
        pal = 0;
        pbl = 0;
        par = 0;
        pbr = 0;
        cve = 0;
        bdss.dsoptimalsplit2(a, c, n, ref info, ref threshold, ref pal, ref pbl, ref par, ref pbr, ref cve);
        return;
    }
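    //
    // Usage sketch for DSOptimalSplit2 above (illustrative only, not part of
    // the ALGLIB API). It finds the optimal binary split of a one-dimensional
    // attribute; the helper name dsoptimalsplit2sketch and the values below
    // are made up for demonstration.
    //
    private static void dsoptimalsplit2sketch()
    {
        // Attribute values and binary class labels: small values tend to be
        // class 0, large values class 1, so a threshold near 0.5 is expected.
        double[] a = new double[]{0.1, 0.2, 0.3, 0.7, 0.8, 0.9};
        int[] c = new int[]{0, 0, 0, 1, 1, 1};
        int info;
        double threshold, pal, pbl, par, pbr, cve;

        dsoptimalsplit2(a, c, 6, out info, out threshold, out pal, out pbl, out par, out pbr, out cve);

        // info=1 on success; Threshold separates the two groups, and
        // PAL/PBL and PAR/PBR are the class probabilities on each side.
    }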
    /*************************************************************************
    Optimal partition, internal subroutine. Fast version.

    Accepts:
        A           array[0..N-1]       array of attributes
        C           array[0..N-1]       array of class labels
        TiesBuf     array[0..N]         temporaries (ties)
        CntBuf      array[0..2*NC-1]    temporaries (counts)
        Alpha       centering factor (0<=alpha<=1, recommended value - 0.05)
        BufR        array[0..N-1]       temporaries
        BufI        array[0..N-1]       temporaries

    Output:
        Info        error code (">0"=OK, "<0"=bad)
        Threshold   partition boundary
        RMS         training set RMS error
        CVRMS       leave-one-out RMS error

    Note: the content of all arrays is changed by the subroutine;
    it doesn't allocate temporaries.

      -- ALGLIB --
         Copyright 11.12.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void dsoptimalsplit2fast(ref double[] a, ref int[] c, ref int[] tiesbuf, ref int[] cntbuf, ref double[] bufr, ref int[] bufi, int n, int nc, double alpha, out int info, out double threshold, out double rms, out double cvrms)
    {
        info = 0;
        threshold = 0;
        rms = 0;
        cvrms = 0;
        bdss.dsoptimalsplit2fast(ref a, ref c, ref tiesbuf, ref cntbuf, ref bufr, ref bufi, n, nc, alpha, ref info, ref threshold, ref rms, ref cvrms);
        return;
    }
}

public partial class alglib
{
    /*************************************************************************
    k-means++ clustering

    INPUT PARAMETERS:
        XY          -   dataset, array [0..NPoints-1,0..NVars-1].
        NPoints     -   dataset size, NPoints>=K
        NVars       -   number of variables, NVars>=1
        K           -   desired number of clusters, K>=1
        Restarts    -   number of restarts, Restarts>=1

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -3, if task is degenerate (number of distinct
                              points is less than K)
                        * -1, if incorrect NPoints/NVars/K/Restarts was passed
                        *  1, if subroutine finished successfully
        C           -   array[0..NVars-1,0..K-1], matrix whose columns store
                        cluster centers
        XYC         -   array which contains the index of the cluster each
                        dataset point belongs to.

      -- ALGLIB --
         Copyright 21.03.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void kmeansgenerate(double[,] xy, int npoints, int nvars, int k, int restarts, out int info, out double[,] c, out int[] xyc)
    {
        info = 0;
        c = new double[0,0];
        xyc = new int[0];
        kmeans.kmeansgenerate(xy, npoints, nvars, k, restarts, ref info, ref c, ref xyc);
        return;
    }
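    //
    // Usage sketch for KMeansGenerate above (illustrative only, not part of
    // the ALGLIB API). It clusters a tiny two-dimensional dataset into two
    // groups; the helper name kmeanssketch and the coordinates below are made
    // up for demonstration.
    //
    private static void kmeanssketch()
    {
        // Six points forming two well-separated groups.
        double[,] xy = new double[,]{
            {0.0, 0.1}, {0.1, 0.0}, {0.1, 0.1},
            {5.0, 5.1}, {5.1, 5.0}, {5.1, 5.1}
        };
        int info;
        double[,] c;
        int[] xyc;

        // 6 points, 2 variables, K=2 clusters, 5 random restarts.
        kmeansgenerate(xy, 6, 2, 2, 5, out info, out c, out xyc);

        // On success info=1, c[.,j] holds the center of cluster j, and
        // xyc[i] is the cluster index assigned to point i.
    }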
}

public partial class alglib
{
    /*************************************************************************
    Multiclass Fisher LDA

    The subroutine finds coefficients of a linear combination which optimally
    separates the classes of the training set.

    INPUT PARAMETERS:
        XY          -   training set, array[0..NPoints-1,0..NVars].
                        The first NVars columns store values of independent
                        variables, the next column stores the class number
                        (from 0 to NClasses-1) which the dataset element
                        belongs to. Fractional values are rounded to the
                        nearest integer.
        NPoints     -   training set size, NPoints>=0
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   number of classes, NClasses>=2

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -4, if internal EVD subroutine hasn't converged
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<0, NVars<1, NClasses<2)
                        *  1, if task has been solved
                        *  2, if there was multicollinearity in the training
                              set, but the task has been solved.
        W           -   linear combination coefficients, array[0..NVars-1]

      -- ALGLIB --
         Copyright 31.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void fisherlda(double[,] xy, int npoints, int nvars, int nclasses, out int info, out double[] w)
    {
        info = 0;
        w = new double[0];
        lda.fisherlda(xy, npoints, nvars, nclasses, ref info, ref w);
        return;
    }

    /*************************************************************************
    N-dimensional multiclass Fisher LDA

    The subroutine finds coefficients of linear combinations which optimally
    separate the classes of the training set. It returns an N-dimensional
    basis whose vectors are sorted by quality of training set separation
    (in descending order).

    INPUT PARAMETERS:
        XY          -   training set, array[0..NPoints-1,0..NVars].
                        The first NVars columns store values of independent
                        variables, the next column stores the class number
                        (from 0 to NClasses-1) which the dataset element
                        belongs to. Fractional values are rounded to the
                        nearest integer.
        NPoints     -   training set size, NPoints>=0
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   number of classes, NClasses>=2

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -4, if internal EVD subroutine hasn't converged
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<0, NVars<1, NClasses<2)
                        *  1, if task has been solved
                        *  2, if there was multicollinearity in the training
                              set, but the task has been solved.
        W           -   basis, array[0..NVars-1,0..NVars-1];
                        the columns of the matrix store the basis vectors,
                        sorted by quality of training set separation
                        (in descending order)

      -- ALGLIB --
         Copyright 31.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void fisherldan(double[,] xy, int npoints, int nvars, int nclasses, out int info, out double[,] w)
    {
        info = 0;
        w = new double[0,0];
        lda.fisherldan(xy, npoints, nvars, nclasses, ref info, ref w);
        return;
    }
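    //
    // Usage sketch for FisherLDA above (illustrative only, not part of the
    // ALGLIB API). It computes the Fisher LDA direction for a tiny two-class
    // dataset; the helper name fisherldasketch and the values below are made
    // up for demonstration.
    //
    private static void fisherldasketch()
    {
        // Two features per row, last column is the class label (0 or 1).
        double[,] xy = new double[,]{
            {1.0, 2.0, 0},
            {1.2, 1.9, 0},
            {3.0, 4.1, 1},
            {3.1, 4.0, 1}
        };
        int info;
        double[] w;

        // 4 points, 2 independent variables, 2 classes.
        fisherlda(xy, 4, 2, 2, out info, out w);

        // On success info=1 (or 2 in the multicollinear case) and w holds the
        // coefficients of the most discriminative linear combination.
    }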
}

public partial class alglib
{
    /*************************************************************************
    *************************************************************************/
    public class linearmodel
    {
        //
        // Public declarations
        //
        public linearmodel()
        {
            _innerobj = new linreg.linearmodel();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them directly: they are intended for internal use only.
        //
        private linreg.linearmodel _innerobj;
        public linreg.linearmodel innerobj { get { return _innerobj; } }
        public linearmodel(linreg.linearmodel obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    LRReport structure contains additional information about the linear model:
    * C             -   covariance matrix, array[0..NVars,0..NVars].
                        C[i,j] = Cov(A[i],A[j])
    * RMSError      -   root mean square error on the training set
    * AvgError      -   average error on the training set
    * AvgRelError   -   average relative error on the training set (excluding
                        observations with zero function value).
    * CVRMSError    -   leave-one-out cross-validation estimate of the
                        generalization error. Calculated using a fast
                        algorithm with O(NVars*NPoints) complexity.
    * CVAvgError    -   cross-validation estimate of the average error
    * CVAvgRelError -   cross-validation estimate of the average relative error

    All other fields of the structure are intended for internal use and
    should not be used outside ALGLIB.
    *************************************************************************/
    public class lrreport
    {
        //
        // Public declarations
        //
        public double[,] c { get { return _innerobj.c; } set { _innerobj.c = value; } }
        public double rmserror { get { return _innerobj.rmserror; } set { _innerobj.rmserror = value; } }
        public double avgerror { get { return _innerobj.avgerror; } set { _innerobj.avgerror = value; } }
        public double avgrelerror { get { return _innerobj.avgrelerror; } set { _innerobj.avgrelerror = value; } }
        public double cvrmserror { get { return _innerobj.cvrmserror; } set { _innerobj.cvrmserror = value; } }
        public double cvavgerror { get { return _innerobj.cvavgerror; } set { _innerobj.cvavgerror = value; } }
        public double cvavgrelerror { get { return _innerobj.cvavgrelerror; } set { _innerobj.cvavgrelerror = value; } }
        public int ncvdefects { get { return _innerobj.ncvdefects; } set { _innerobj.ncvdefects = value; } }
        public int[] cvdefects { get { return _innerobj.cvdefects; } set { _innerobj.cvdefects = value; } }

        public lrreport()
        {
            _innerobj = new linreg.lrreport();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them directly: they are intended for internal use only.
        //
        private linreg.lrreport _innerobj;
        public linreg.lrreport innerobj { get { return _innerobj; } }
        public lrreport(linreg.lrreport obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    Linear regression

    The subroutine builds the model

        Y = A(0)*X[0] + ... + A(N-1)*X[N-1] + A(N)

    and returns the model in ALGLIB format together with the covariance
    matrix, training set errors (RMS, average, average relative) and a
    leave-one-out cross-validation estimate of the generalization error.
    The CV estimate is calculated using a fast algorithm with
    O(NPoints*NVars) complexity.

    When the covariance matrix is calculated, standard deviations of the
    function values are assumed to be equal to the RMS error on the
    training set.

    INPUT PARAMETERS:
        XY          -   training set, array [0..NPoints-1,0..NVars]:
                        * NVars columns - independent variables
                        * last column - dependent variable
        NPoints     -   training set size, NPoints>NVars+1
        NVars       -   number of independent variables

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -255, in case of unknown internal error
                        * -4, if internal SVD subroutine hasn't converged
                        * -1, if incorrect parameters were passed
                              (NPoints<NVars+2, NVars<1)
                        *  1, if subroutine successfully finished
        LM          -   linear model in the ALGLIB format. Use subroutines
                        of this unit to work with the model.
        AR          -   additional results
    *************************************************************************/
    public static void lrbuild(double[,] xy, int npoints, int nvars, out int info, out linearmodel lm, out lrreport ar)
    {
        info = 0;
        lm = new linearmodel();
        ar = new lrreport();
        linreg.lrbuild(xy, npoints, nvars, ref info, lm.innerobj, ar.innerobj);
        return;
    }

    /*************************************************************************
    Linear regression

    Variant of LRBuild which uses a vector of standard deviations (errors in
    the function values).

    INPUT PARAMETERS:
        XY          -   training set, array [0..NPoints-1,0..NVars]:
                        * NVars columns - independent variables
                        * last column - dependent variable
        S           -   standard deviations (errors in function values),
                        array[0..NPoints-1], S[i]>0.
        NPoints     -   training set size, NPoints>NVars+1
        NVars       -   number of independent variables

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -255, in case of unknown internal error
                        * -4, if internal SVD subroutine hasn't converged
                        * -1, if incorrect parameters were passed
                              (NPoints<NVars+2, NVars<1)
                        *  1, if subroutine successfully finished
        LM          -   linear model in the ALGLIB format. Use subroutines
                        of this unit to work with the model.
        AR          -   additional results
    *************************************************************************/
    public static void lrbuilds(double[,] xy, double[] s, int npoints, int nvars, out int info, out linearmodel lm, out lrreport ar)
    {
        info = 0;
        lm = new linearmodel();
        ar = new lrreport();
        linreg.lrbuilds(xy, s, npoints, nvars, ref info, lm.innerobj, ar.innerobj);
        return;
    }

    /*************************************************************************
    INPUT PARAMETERS:
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   number of classes, NClasses>=2

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed (NPoints
    *************************************************************************/
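    //
    // Usage sketch for the LRBuild wrapper above (illustrative only, not part
    // of the ALGLIB API). It fits y ~ a*x + b on a tiny made-up dataset and
    // reads the training and cross-validation errors from the report. The
    // helper name lrbuildsketch and the data values are made up for
    // demonstration.
    //
    private static void lrbuildsketch()
    {
        // Last column is the dependent variable; here y is roughly 2*x + 1.
        double[,] xy = new double[,]{
            {0.0, 1.0},
            {1.0, 3.1},
            {2.0, 4.9},
            {3.0, 7.0},
            {4.0, 9.1}
        };
        int info;
        linearmodel lm;
        lrreport ar;

        // 5 points, 1 independent variable (NPoints>NVars+1 holds).
        lrbuild(xy, 5, 1, out info, out lm, out ar);

        // On success info=1; ar.rmserror is the training set RMS error and
        // ar.cvrmserror is the leave-one-out estimate of generalization error.
        double rms = ar.rmserror;
        double cvrms = ar.cvrmserror;
    }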