/*************************************************************************
Copyright (c) Sergey Bochkanov (ALGLIB project).

>>> SOURCE LICENSE >>>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation (www.fsf.org); either version 2 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

A copy of the GNU General Public License is available at
http://www.fsf.org/licensing/licenses
>>> END OF LICENSE >>>
*************************************************************************/
#pragma warning disable 162
#pragma warning disable 219
using System;

public partial class alglib
{
    /*************************************************************************
    Optimal binary classification

    Algorithm finds optimal (=with minimal cross-entropy) binary partition.
    Internal subroutine.

    INPUT PARAMETERS:
        A       -   array[0..N-1], variable
        C       -   array[0..N-1], class numbers (0 or 1).
        N       -   array size

    OUTPUT PARAMETERS:
        Info    -   completion code:
                    * -3, all values of A[] are the same (partition is impossible)
                    * -2, one of C[] is incorrect (<0, >1)
                    * -1, incorrect parameters were passed (N<=0).
                    *  1, OK
        Threshold-  partition boundary. Left part contains values which are
                    strictly less than Threshold. Right part contains values
                    which are greater than or equal to Threshold.
        PAL, PBL-   probabilities P(0|v<Threshold) and P(1|v<Threshold)
        PAR, PBR-   probabilities P(0|v>=Threshold) and P(1|v>=Threshold)
        CVE     -   cross-validation estimate of cross-entropy

      -- ALGLIB --
         Copyright 22.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void dsoptimalsplit2(double[] a, int[] c, int n, out int info, out double threshold, out double pal, out double pbl, out double par, out double pbr, out double cve)
    {
        info = 0;
        threshold = 0;
        pal = 0;
        pbl = 0;
        par = 0;
        pbr = 0;
        cve = 0;
        bdss.dsoptimalsplit2(a, c, n, ref info, ref threshold, ref pal, ref pbl, ref par, ref pbr, ref cve);
        return;
    }
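
    /*************************************************************************
    Usage sketch for DSOptimalSplit2. Illustrative only: the method name
    example_dsoptimalsplit2 and the toy data below are hypothetical and are
    not part of the original API.
    *************************************************************************/
    public static void example_dsoptimalsplit2()
    {
        // Toy dataset: six attribute values with binary class labels
        double[] a = new double[]{0.1, 0.2, 0.3, 1.1, 1.2, 1.3};
        int[] c = new int[]{0, 0, 0, 1, 1, 1};
        int info;
        double threshold, pal, pbl, par, pbr, cve;
        dsoptimalsplit2(a, c, 6, out info, out threshold, out pal, out pbl, out par, out pbr, out cve);
        if( info==1 )
        {
            // Values strictly less than threshold fall into the left part,
            // values greater than or equal to it into the right part
            Console.WriteLine("threshold = {0}", threshold);
        }
    }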
    /*************************************************************************
    Optimal partition, internal subroutine. Fast version.

    Accepts:
        A       array[0..N-1]       array of attributes
        C       array[0..N-1]       array of class labels
        TiesBuf array[0..N]         temporaries (ties)
        CntBuf  array[0..2*NC-1]    temporaries (counts)
        Alpha                       centering factor (0<=alpha<=1, recommended value - 0.05)
        BufR    array[0..N-1]       temporaries
        BufI    array[0..N-1]       temporaries

    Output:
        Info        error code (">0"=OK, "<0"=bad)
        Threshold   partition boundary
        RMS         training set RMS error
        CVRMS       leave-one-out RMS error

    Note:
        content of all arrays is changed by subroutine;
        it doesn't allocate temporaries.

      -- ALGLIB --
         Copyright 11.12.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void dsoptimalsplit2fast(ref double[] a, ref int[] c, ref int[] tiesbuf, ref int[] cntbuf, ref double[] bufr, ref int[] bufi, int n, int nc, double alpha, out int info, out double threshold, out double rms, out double cvrms)
    {
        info = 0;
        threshold = 0;
        rms = 0;
        cvrms = 0;
        bdss.dsoptimalsplit2fast(ref a, ref c, ref tiesbuf, ref cntbuf, ref bufr, ref bufi, n, nc, alpha, ref info, ref threshold, ref rms, ref cvrms);
        return;
    }
}

public partial class alglib
{
    /*************************************************************************

    *************************************************************************/
    public class decisionforest
    {
        //
        // Public declarations
        //

        public decisionforest()
        {
            _innerobj = new dforest.decisionforest();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them. They are intended for internal use only.
        //
        private dforest.decisionforest _innerobj;
        public dforest.decisionforest innerobj { get { return _innerobj; } }
        public decisionforest(dforest.decisionforest obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************

    *************************************************************************/
    public class dfreport
    {
        //
        // Public declarations
        //
        public double relclserror { get { return _innerobj.relclserror; } set { _innerobj.relclserror = value; } }
        public double avgce { get { return _innerobj.avgce; } set { _innerobj.avgce = value; } }
        public double rmserror { get { return _innerobj.rmserror; } set { _innerobj.rmserror = value; } }
        public double avgerror { get { return _innerobj.avgerror; } set { _innerobj.avgerror = value; } }
        public double avgrelerror { get { return _innerobj.avgrelerror; } set { _innerobj.avgrelerror = value; } }
        public double oobrelclserror { get { return _innerobj.oobrelclserror; } set { _innerobj.oobrelclserror = value; } }
        public double oobavgce { get { return _innerobj.oobavgce; } set { _innerobj.oobavgce = value; } }
        public double oobrmserror { get { return _innerobj.oobrmserror; } set { _innerobj.oobrmserror = value; } }
        public double oobavgerror { get { return _innerobj.oobavgerror; } set { _innerobj.oobavgerror = value; } }
        public double oobavgrelerror { get { return _innerobj.oobavgrelerror; } set { _innerobj.oobavgrelerror = value; } }

        public dfreport()
        {
            _innerobj = new dforest.dfreport();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them. They are intended for internal use only.
        //
        private dforest.dfreport _innerobj;
        public dforest.dfreport innerobj { get { return _innerobj; } }
        public dfreport(dforest.dfreport obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    This function serializes data structure to string.

    Important properties of s_out:
    * it contains alphanumeric characters, dots, underscores, minus signs
    * these symbols are grouped into words, which are separated by spaces
      and Windows-style (CR+LF) newlines
    * although serializer uses spaces and CR+LF as separators, you can
      replace any separator character by an arbitrary combination of spaces,
      tabs, Windows or Unix newlines. This allows flexible reformatting of
      the string in case you want to include it into a text or XML file.
      But you should not insert separators into the middle of the "words",
      nor should you change the case of letters.
    * s_out can be freely moved between 32-bit and 64-bit systems, little
      and big endian machines, and so on. You can serialize structure on a
      32-bit machine and unserialize it on a 64-bit one (or vice versa), or
      serialize it on SPARC and unserialize it on x86. You can also
      serialize it in the C# version of ALGLIB and unserialize it in the
      C++ one, and vice versa.
    *************************************************************************/
    public static void dfserialize(decisionforest obj, out string s_out)
    {
        alglib.serializer s = new alglib.serializer();
        s.alloc_start();
        dforest.dfalloc(s, obj.innerobj);
        s.sstart_str();
        dforest.dfserialize(s, obj.innerobj);
        s.stop();
        s_out = s.get_string();
    }


    /*************************************************************************
    This function unserializes data structure from string.
    *************************************************************************/
    public static void dfunserialize(string s_in, out decisionforest obj)
    {
        alglib.serializer s = new alglib.serializer();
        obj = new decisionforest();
        s.ustart_str(s_in);
        dforest.dfunserialize(s, obj.innerobj);
        s.stop();
    }
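
    /*************************************************************************
    Serialization round-trip sketch. Illustrative only: the method name is
    hypothetical, and the trained forest is assumed to come from one of the
    dfbuildrandomdecisionforest* functions below.
    *************************************************************************/
    public static void example_dfserialization(decisionforest df)
    {
        // Serialize a trained forest to a portable string; the string may
        // be stored in a file or moved between platforms and languages
        string s;
        dfserialize(df, out s);

        // Restore an equivalent model from the string
        decisionforest df2;
        dfunserialize(s, out df2);
    }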
    /*************************************************************************
    This subroutine builds random decision forest.

    INPUT PARAMETERS:
        XY          -   training set
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   task type:
                        * NClasses=1 - regression task with one
                                       dependent variable
                        * NClasses>1 - classification task with
                                       NClasses classes.
        NTrees      -   number of trees in a forest, NTrees>=1.
                        recommended values: 50-100.
        R           -   percent of a training set used to build
                        individual trees. 0<R<=1, recommended value - 0.66.

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<1, NVars<1, NClasses<1, NTrees<1,
                              R<=0 or R>1).
                        *  1, if task has been solved
        DF          -   model built
        Rep         -   training report, contains error on a training set
                        and out-of-bag estimates of generalization error.

      -- ALGLIB --
         Copyright 19.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep)
    {
        info = 0;
        df = new decisionforest();
        rep = new dfreport();
        dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj);
        return;
    }
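
    /*************************************************************************
    Forest construction sketch. Illustrative only: the method name and the
    toy training set below are hypothetical.
    *************************************************************************/
    public static void example_dfbuild()
    {
        // Toy classification set: two variables, class label (0 or 1)
        // in the last column
        double[,] xy = new double[,]{
            {0.0, 0.0, 0},
            {0.1, 0.2, 0},
            {0.2, 0.1, 0},
            {1.0, 1.0, 1},
            {0.9, 1.1, 1},
            {1.1, 0.9, 1}};
        int info;
        decisionforest df;
        dfreport rep;

        // 50 trees, each grown on 66% of the training set
        dfbuildrandomdecisionforest(xy, 6, 2, 2, 50, 0.66, out info, out df, out rep);
        if( info==1 )
        {
            // Rep holds training set errors and out-of-bag estimates
            Console.WriteLine("OOB classification error = {0}", rep.oobrelclserror);
        }
    }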
    /*************************************************************************
    This subroutine builds random decision forest. This function gives the
    ability to tune the number of variables used when choosing the best
    split.

    INPUT PARAMETERS:
        XY          -   training set
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   task type:
                        * NClasses=1 - regression task with one
                                       dependent variable
                        * NClasses>1 - classification task with
                                       NClasses classes.
        NTrees      -   number of trees in a forest, NTrees>=1.
                        recommended values: 50-100.
        NRndVars    -   number of variables used when choosing best split
        R           -   percent of a training set used to build
                        individual trees. 0<R<=1, recommended value - 0.66.

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<1, NVars<1, NClasses<1, NTrees<1,
                              R<=0 or R>1).
                        *  1, if task has been solved
        DF          -   model built
        Rep         -   training report, contains error on a training set
                        and out-of-bag estimates of generalization error.

      -- ALGLIB --
         Copyright 19.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfbuildrandomdecisionforestx1(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, int nrndvars, double r, out int info, out decisionforest df, out dfreport rep)
    {
        info = 0;
        df = new decisionforest();
        rep = new dfreport();
        dforest.dfbuildrandomdecisionforestx1(xy, npoints, nvars, nclasses, ntrees, nrndvars, r, ref info, df.innerobj, rep.innerobj);
        return;
    }


    /*************************************************************************
    Processing

    INPUT PARAMETERS:
        DF      -   decision forest model
        X       -   input vector, array[0..NVars-1].

    OUTPUT PARAMETERS:
        Y       -   result. Regression estimate when solving regression task,
                    vector of posterior probabilities for classification task.

    See also DFProcessI.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocess(decisionforest df, double[] x, ref double[] y)
    {
        dforest.dfprocess(df.innerobj, x, ref y);
        return;
    }


    /*************************************************************************
    'interactive' variant of DFProcess for languages like Python which
    support constructs like "Y = DFProcessI(DF,X)" and interactive mode of
    the interpreter.

    This function allocates a new array on each call, so it is significantly
    slower than its 'non-interactive' counterpart, but it is more convenient
    when you call it from the command line.

      -- ALGLIB --
         Copyright 28.02.2010 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocessi(decisionforest df, double[] x, out double[] y)
    {
        y = new double[0];
        dforest.dfprocessi(df.innerobj, x, ref y);
        return;
    }


    /*************************************************************************
    Relative classification error on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        percent of incorrectly classified cases.
        Zero if model solves regression task.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfrelclserror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfrelclserror(df.innerobj, xy, npoints);
        return result;
    }


    /*************************************************************************
    Average cross-entropy (in bits per element) on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        CrossEntropy/(NPoints*LN(2)).
        Zero if model solves regression task.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgce(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgce(df.innerobj, xy, npoints);
        return result;
    }
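
    /*************************************************************************
    Query sketch. Illustrative only: the method name is hypothetical, and
    the forest is assumed to be a 2-class model built as in the example
    above.
    *************************************************************************/
    public static void example_dfprocess(decisionforest df)
    {
        // Query the model at a single point; for a classification task Y
        // holds one posterior probability per class, for a regression task
        // a single estimate. DFProcessI allocates Y itself.
        double[] x = new double[]{0.05, 0.15};
        double[] y;
        dfprocessi(df, x, out y);
        Console.WriteLine("P(class=0)={0}, P(class=1)={1}", y[0], y[1]);
    }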
    /*************************************************************************
    RMS error on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        root mean square error.
        Its meaning for regression task is obvious. As for classification
        task, RMS error means error when estimating posterior probabilities.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfrmserror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfrmserror(df.innerobj, xy, npoints);
        return result;
    }


    /*************************************************************************
    Average error on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        Its meaning for regression task is obvious. As for classification
        task, it means average error when estimating posterior probabilities.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgerror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgerror(df.innerobj, xy, npoints);
        return result;
    }


    /*************************************************************************
    Average relative error on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        Its meaning for regression task is obvious. As for classification
        task, it means average relative error when estimating posterior
        probability of belonging to the correct class.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgrelerror(decisionforest df, double[,] xy, int npoints)
    {
        double result = dforest.dfavgrelerror(df.innerobj, xy, npoints);
        return result;
    }
}

public partial class alglib
{
    /*************************************************************************
    k-means++ clustering

    INPUT PARAMETERS:
        XY          -   dataset, array [0..NPoints-1,0..NVars-1].
        NPoints     -   dataset size, NPoints>=K
        NVars       -   number of variables, NVars>=1
        K           -   desired number of clusters, K>=1
        Restarts    -   number of restarts, Restarts>=1

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -3, if task is degenerate (number of distinct
                              points is less than K)
                        * -1, if incorrect NPoints/NVars/K/Restarts were
                              passed
                        *  1, if subroutine finished successfully
        C           -   array[0..NVars-1,0..K-1], matrix whose columns store
                        cluster centers
        XYC         -   array[NPoints], which contains cluster indexes

      -- ALGLIB --
         Copyright 21.03.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void kmeansgenerate(double[,] xy, int npoints, int nvars, int k, int restarts, out int info, out double[,] c, out int[] xyc)
    {
        info = 0;
        c = new double[0,0];
        xyc = new int[0];
        kmeans.kmeansgenerate(xy, npoints, nvars, k, restarts, ref info, ref c, ref xyc);
        return;
    }
}
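
public partial class alglib
{
    /*************************************************************************
    Clustering sketch. Illustrative only: the method name and the toy
    dataset below are hypothetical.
    *************************************************************************/
    public static void example_kmeans()
    {
        // Six 2-dimensional points forming two well-separated groups
        double[,] xy = new double[,]{
            {0.0, 0.0}, {0.1, 0.1}, {0.2, 0.0},
            {5.0, 5.0}, {5.1, 5.1}, {5.2, 5.0}};
        int info;
        double[,] c;
        int[] xyc;

        // Ask for K=2 clusters with 5 random restarts
        kmeansgenerate(xy, 6, 2, 2, 5, out info, out c, out xyc);
        if( info==1 )
        {
            // Columns of C store the cluster centers; XYC[i] is the index
            // of the cluster assigned to the i-th point
            Console.WriteLine("first center: ({0},{1})", c[0,0], c[1,0]);
        }
    }
}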
public partial class alglib
{
    /*************************************************************************
    Multiclass Fisher LDA

    Subroutine finds coefficients of a linear combination which optimally
    separates the training set into classes.

    INPUT PARAMETERS:
        XY          -   training set, array[0..NPoints-1,0..NVars].
                        First NVars columns store values of independent
                        variables, next column stores number of class (from 0
                        to NClasses-1) which dataset element belongs to.
                        Fractional values are rounded to nearest integer.
        NPoints     -   training set size, NPoints>=0
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   number of classes, NClasses>=2

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -4, if internal EVD subroutine hasn't converged
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed (NPoints<0,
                              NVars<1, NClasses<2)
                        *  1, if task has been solved
                        *  2, if there was a multicollinearity in the training
                              set, but task has been solved.
        W           -   linear combination coefficients, array[0..NVars-1]

      -- ALGLIB --
         Copyright 31.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void fisherlda(double[,] xy, int npoints, int nvars, int nclasses, out int info, out double[] w)
    {
        info = 0;
        w = new double[0];
        lda.fisherlda(xy, npoints, nvars, nclasses, ref info, ref w);
        return;
    }


    /*************************************************************************
    N-dimensional multiclass Fisher LDA

    Subroutine finds coefficients of linear combinations which optimally
    separate the training set into classes. It returns an N-dimensional
    basis whose vectors are sorted by quality of training set separation
    (in descending order).

    INPUT PARAMETERS:
        XY          -   training set, array[0..NPoints-1,0..NVars].
                        First NVars columns store values of independent
                        variables, next column stores number of class (from 0
                        to NClasses-1) which dataset element belongs to.
                        Fractional values are rounded to nearest integer.
        NPoints     -   training set size, NPoints>=0
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   number of classes, NClasses>=2

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -4, if internal EVD subroutine hasn't converged
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed (NPoints<0,
                              NVars<1, NClasses<2)
                        *  1, if task has been solved
                        *  2, if there was a multicollinearity in the training
                              set, but task has been solved.
        W           -   basis, array[0..NVars-1,0..NVars-1]
                        columns of the matrix store basis vectors, sorted by
                        quality of training set separation (in descending
                        order)

      -- ALGLIB --
         Copyright 31.05.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void fisherldan(double[,] xy, int npoints, int nvars, int nclasses, out int info, out double[,] w)
    {
        info = 0;
        w = new double[0,0];
        lda.fisherldan(xy, npoints, nvars, nclasses, ref info, ref w);
        return;
    }
}
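
public partial class alglib
{
    /*************************************************************************
    Fisher LDA sketch. Illustrative only: the method name and the toy
    training set below are hypothetical.
    *************************************************************************/
    public static void example_fisherlda()
    {
        // Toy set: two independent variables plus a class label (0 or 1)
        // in the last column
        double[,] xy = new double[,]{
            {0.0, 0.0, 0},
            {0.1, 0.2, 0},
            {0.2, 0.1, 0},
            {1.0, 1.0, 1},
            {0.9, 1.1, 1},
            {1.1, 0.9, 1}};
        int info;
        double[] w;
        fisherlda(xy, 6, 2, 2, out info, out w);
        if( info==1 )
        {
            // W holds coefficients of the separating linear combination
            Console.WriteLine("w = ({0},{1})", w[0], w[1]);
        }
    }
}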
public partial class alglib
{
    /*************************************************************************

    *************************************************************************/
    public class linearmodel
    {
        //
        // Public declarations
        //

        public linearmodel()
        {
            _innerobj = new linreg.linearmodel();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them. They are intended for internal use only.
        //
        private linreg.linearmodel _innerobj;
        public linreg.linearmodel innerobj { get { return _innerobj; } }
        public linearmodel(linreg.linearmodel obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    LRReport structure contains additional information about linear model:
    * C             -   covariance matrix, array[0..NVars,0..NVars].
                        C[i,j] = Cov(A[i],A[j])
    * RMSError      -   root mean square error on a training set
    * AvgError      -   average error on a training set
    * AvgRelError   -   average relative error on a training set (excluding
                        observations with zero function value).
    * CVRMSError    -   leave-one-out cross-validation estimate of
                        generalization error. Calculated using fast algorithm
                        with O(NVars*NPoints) complexity.
    * CVAvgError    -   cross-validation estimate of average error
    * CVAvgRelError -   cross-validation estimate of average relative error

    All other fields of the structure are intended for internal use and
    should not be used outside ALGLIB.
    *************************************************************************/
    public class lrreport
    {
        //
        // Public declarations
        //
        public double[,] c { get { return _innerobj.c; } set { _innerobj.c = value; } }
        public double rmserror { get { return _innerobj.rmserror; } set { _innerobj.rmserror = value; } }
        public double avgerror { get { return _innerobj.avgerror; } set { _innerobj.avgerror = value; } }
        public double avgrelerror { get { return _innerobj.avgrelerror; } set { _innerobj.avgrelerror = value; } }
        public double cvrmserror { get { return _innerobj.cvrmserror; } set { _innerobj.cvrmserror = value; } }
        public double cvavgerror { get { return _innerobj.cvavgerror; } set { _innerobj.cvavgerror = value; } }
        public double cvavgrelerror { get { return _innerobj.cvavgrelerror; } set { _innerobj.cvavgrelerror = value; } }
        public int ncvdefects { get { return _innerobj.ncvdefects; } set { _innerobj.ncvdefects = value; } }
        public int[] cvdefects { get { return _innerobj.cvdefects; } set { _innerobj.cvdefects = value; } }

        public lrreport()
        {
            _innerobj = new linreg.lrreport();
        }

        //
        // Although some of the declarations below are public, you should not
        // use them. They are intended for internal use only.
        //
        private linreg.lrreport _innerobj;
        public linreg.lrreport innerobj { get { return _innerobj; } }
        public lrreport(linreg.lrreport obj)
        {
            _innerobj = obj;
        }
    }


    /*************************************************************************
    Linear regression

    Subroutine builds model:

        Y = A(0)*X[0] + ... + A(N-1)*X[N-1] + A(N)

    and returns the model in ALGLIB format, together with the covariance
    matrix, training set errors (RMS, average, average relative) and a
    leave-one-out cross-validation estimate of the generalization error. The
    CV estimate is calculated using a fast algorithm with O(NPoints*NVars)
    complexity.

    When the covariance matrix is calculated, standard deviations of function
    values are assumed to be equal to the RMS error on the training set.

    INPUT PARAMETERS:
        XY          -   training set, array [0..NPoints-1,0..NVars]:
                        * NVars columns - independent variables
                        * last column - dependent variable
        NPoints     -   training set size, NPoints>NVars+1
        NVars       -   number of independent variables

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -255, in case of unknown internal error
                        * -4, if internal SVD subroutine hasn't converged
                        * -1, if incorrect parameters were passed
                              (NPoints<NVars+2, NVars<1).
                        *  1, if subroutine successfully finished
        LM          -   linear model in the ALGLIB format. Use subroutines of
                        this unit to work with the model.
        AR          -   additional results

      -- ALGLIB --
         Copyright 02.08.2008 by Bochkanov Sergey
    *************************************************************************/
    public static void lrbuild(double[,] xy, int npoints, int nvars, out int info, out linearmodel lm, out lrreport ar)
    {
        info = 0;
        lm = new linearmodel();
        ar = new lrreport();
        linreg.lrbuild(xy, npoints, nvars, ref info, lm.innerobj, ar.innerobj);
        return;
    }


    /*************************************************************************
    Linear regression

    Variant of LRBuild which uses a vector of standard deviations (errors in
    function values).

    INPUT PARAMETERS:
        XY          -   training set, array [0..NPoints-1,0..NVars]:
                        * NVars columns - independent variables
                        * last column - dependent variable
        S           -   standard deviations (errors in function values),
                        array[0..NPoints-1], S[i]>0.
        NPoints     -   training set size, NPoints>NVars+1
        NVars       -   number of independent variables

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -255, in case of unknown internal error
                        * -4, if internal SVD subroutine hasn't converged
                        * -1, if incorrect parameters were passed (NPoints