Context Navigation

source: trunk/sources/HeuristicLab.ExtLibs/HeuristicLab.ALGLIB/2.5.0/ALGLIB-2.5.0/dforest.cs @ 3932

Visit:

Last change on this file since 3932 was 3839, checked in by mkommend, 15 years ago
implemented first version of LR (ticket #1012)
File size: 63.8 KB

Line
1	/*************************************************************************
2	Copyright (c) 2009, Sergey Bochkanov (ALGLIB project).
3
4	>>> SOURCE LICENSE >>>
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation (www.fsf.org); either version 2 of the
8	License, or (at your option) any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	A copy of the GNU General Public License is available at
16	http://www.fsf.org/licensing/licenses
17
18	>>> END OF LICENSE >>>
19	*************************************************************************/
20
21	using System;
22
23	namespace alglib
24	{
25	public class dforest
26	{
27	public struct decisionforest
28	{
29	public int nvars;
30	public int nclasses;
31	public int ntrees;
32	public int bufsize;
33	public double[] trees;
34	};
35
36
37	public struct dfreport
38	{
39	public double relclserror;
40	public double avgce;
41	public double rmserror;
42	public double avgerror;
43	public double avgrelerror;
44	public double oobrelclserror;
45	public double oobavgce;
46	public double oobrmserror;
47	public double oobavgerror;
48	public double oobavgrelerror;
49	};
50
51
52	public struct dfinternalbuffers
53	{
54	public double[] treebuf;
55	public int[] idxbuf;
56	public double[] tmpbufr;
57	public double[] tmpbufr2;
58	public int[] tmpbufi;
59	public int[] classibuf;
60	public int[] varpool;
61	public bool[] evsbin;
62	public double[] evssplits;
63	};
64
65
66
67
68	public const int dfvnum = 8;
69	public const int innernodewidth = 3;
70	public const int leafnodewidth = 2;
71	public const int dfusestrongsplits = 1;
72	public const int dfuseevs = 2;
73
74
75	/*************************************************************************
76	This subroutine builds random decision forest.
77
78	INPUT PARAMETERS:
79	XY - training set
80	NPoints - training set size, NPoints>=1
81	NVars - number of independent variables, NVars>=1
82	NClasses - task type:
83	* NClasses=1 - regression task with one
84	dependent variable
85	* NClasses>1 - classification task with
86	NClasses classes.
87	NTrees - number of trees in a forest, NTrees>=1.
88	recommended values: 50-100.
89	R - percent of a training set used to build
90	individual trees. 0<R<=1.
91	recommended values: 0.1 <= R <= 0.66.
92
93	OUTPUT PARAMETERS:
94	Info - return code:
95	* -2, if there is a point with class number
96	outside of [0..NClasses-1].
97	* -1, if incorrect parameters was passed
98	(NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
99	or R>1).
100	* 1, if task has been solved
101	DF - model built
102	Rep - training report, contains error on a training set
103	and out-of-bag estimates of generalization error.
104
105	-- ALGLIB --
106	Copyright 19.02.2009 by Bochkanov Sergey
107	*************************************************************************/
108	public static void dfbuildrandomdecisionforest(ref double[,] xy,
109	int npoints,
110	int nvars,
111	int nclasses,
112	int ntrees,
113	double r,
114	ref int info,
115	ref decisionforest df,
116	ref dfreport rep)
117	{
118	int samplesize = 0;
119
120	if( (double)(r)<=(double)(0) \| (double)(r)>(double)(1) )
121	{
122	info = -1;
123	return;
124	}
125	samplesize = Math.Max((int)Math.Round(r*npoints), 1);
126	dfbuildinternal(ref xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, ref df, ref rep);
127	}
128
129
130	public static void dfbuildinternal(ref double[,] xy,
131	int npoints,
132	int nvars,
133	int nclasses,
134	int ntrees,
135	int samplesize,
136	int nfeatures,
137	int flags,
138	ref int info,
139	ref decisionforest df,
140	ref dfreport rep)
141	{
142	int i = 0;
143	int j = 0;
144	int k = 0;
145	int tmpi = 0;
146	int lasttreeoffs = 0;
147	int offs = 0;
148	int ooboffs = 0;
149	int treesize = 0;
150	int nvarsinpool = 0;
151	bool useevs = new bool();
152	dfinternalbuffers bufs = new dfinternalbuffers();
153	int[] permbuf = new int[0];
154	double[] oobbuf = new double[0];
155	int[] oobcntbuf = new int[0];
156	double[,] xys = new double[0,0];
157	double[] x = new double[0];
158	double[] y = new double[0];
159	int oobcnt = 0;
160	int oobrelcnt = 0;
161	double v = 0;
162	double vmin = 0;
163	double vmax = 0;
164	bool bflag = new bool();
165	int i_ = 0;
166	int i1_ = 0;
167
168
169	//
170	// Test for inputs
171	//
172	if( npoints<1 \| samplesize<1 \| samplesize>npoints \| nvars<1 \| nclasses<1 \| ntrees<1 \| nfeatures<1 )
173	{
174	info = -1;
175	return;
176	}
177	if( nclasses>1 )
178	{
179	for(i=0; i<=npoints-1; i++)
180	{
181	if( (int)Math.Round(xy[i,nvars])<0 \| (int)Math.Round(xy[i,nvars])>=nclasses )
182	{
183	info = -2;
184	return;
185	}
186	}
187	}
188	info = 1;
189
190	//
191	// Flags
192	//
193	useevs = flags/dfuseevs%2!=0;
194
195	//
196	// Allocate data, prepare header
197	//
198	treesize = 1+innernodewidth(samplesize-1)+leafnodewidthsamplesize;
199	permbuf = new int[npoints-1+1];
200	bufs.treebuf = new double[treesize-1+1];
201	bufs.idxbuf = new int[npoints-1+1];
202	bufs.tmpbufr = new double[npoints-1+1];
203	bufs.tmpbufr2 = new double[npoints-1+1];
204	bufs.tmpbufi = new int[npoints-1+1];
205	bufs.varpool = new int[nvars-1+1];
206	bufs.evsbin = new bool[nvars-1+1];
207	bufs.evssplits = new double[nvars-1+1];
208	bufs.classibuf = new int[2*nclasses-1+1];
209	oobbuf = new double[nclasses*npoints-1+1];
210	oobcntbuf = new int[npoints-1+1];
211	df.trees = new double[ntrees*treesize-1+1];
212	xys = new double[samplesize-1+1, nvars+1];
213	x = new double[nvars-1+1];
214	y = new double[nclasses-1+1];
215	for(i=0; i<=npoints-1; i++)
216	{
217	permbuf[i] = i;
218	}
219	for(i=0; i<=npoints*nclasses-1; i++)
220	{
221	oobbuf[i] = 0;
222	}
223	for(i=0; i<=npoints-1; i++)
224	{
225	oobcntbuf[i] = 0;
226	}
227
228	//
229	// Prepare variable pool and EVS (extended variable selection/splitting) buffers
230	// (whether EVS is turned on or not):
231	// 1. detect binary variables and pre-calculate splits for them
232	// 2. detect variables with non-distinct values and exclude them from pool
233	//
234	for(i=0; i<=nvars-1; i++)
235	{
236	bufs.varpool[i] = i;
237	}
238	nvarsinpool = nvars;
239	if( useevs )
240	{
241	for(j=0; j<=nvars-1; j++)
242	{
243	vmin = xy[0,j];
244	vmax = vmin;
245	for(i=0; i<=npoints-1; i++)
246	{
247	v = xy[i,j];
248	vmin = Math.Min(vmin, v);
249	vmax = Math.Max(vmax, v);
250	}
251	if( (double)(vmin)==(double)(vmax) )
252	{
253
254	//
255	// exclude variable from pool
256	//
257	bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
258	bufs.varpool[nvarsinpool-1] = -1;
259	nvarsinpool = nvarsinpool-1;
260	continue;
261	}
262	bflag = false;
263	for(i=0; i<=npoints-1; i++)
264	{
265	v = xy[i,j];
266	if( (double)(v)!=(double)(vmin) & (double)(v)!=(double)(vmax) )
267	{
268	bflag = true;
269	break;
270	}
271	}
272	if( bflag )
273	{
274
275	//
276	// non-binary variable
277	//
278	bufs.evsbin[j] = false;
279	}
280	else
281	{
282
283	//
284	// Prepare
285	//
286	bufs.evsbin[j] = true;
287	bufs.evssplits[j] = 0.5*(vmin+vmax);
288	if( (double)(bufs.evssplits[j])<=(double)(vmin) )
289	{
290	bufs.evssplits[j] = vmax;
291	}
292	}
293	}
294	}
295
296	//
297	// RANDOM FOREST FORMAT
298	// W[0] - size of array
299	// W[1] - version number
300	// W[2] - NVars
301	// W[3] - NClasses (1 for regression)
302	// W[4] - NTrees
303	// W[5] - trees offset
304	//
305	//
306	// TREE FORMAT
307	// W[Offs] - size of sub-array
308	// node info:
309	// W[K+0] - variable number (-1 for leaf mode)
310	// W[K+1] - threshold (class/value for leaf node)
311	// W[K+2] - ">=" branch index (absent for leaf node)
312	//
313	//
314	df.nvars = nvars;
315	df.nclasses = nclasses;
316	df.ntrees = ntrees;
317
318	//
319	// Build forest
320	//
321	offs = 0;
322	for(i=0; i<=ntrees-1; i++)
323	{
324
325	//
326	// Prepare sample
327	//
328	for(k=0; k<=samplesize-1; k++)
329	{
330	j = k+AP.Math.RandomInteger(npoints-k);
331	tmpi = permbuf[k];
332	permbuf[k] = permbuf[j];
333	permbuf[j] = tmpi;
334	j = permbuf[k];
335	for(i_=0; i_<=nvars;i_++)
336	{
337	xys[k,i_] = xy[j,i_];
338	}
339	}
340
341	//
342	// build tree, copy
343	//
344	dfbuildtree(ref xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, ref bufs);
345	j = (int)Math.Round(bufs.treebuf[0]);
346	i1_ = (0) - (offs);
347	for(i_=offs; i_<=offs+j-1;i_++)
348	{
349	df.trees[i_] = bufs.treebuf[i_+i1_];
350	}
351	lasttreeoffs = offs;
352	offs = offs+j;
353
354	//
355	// OOB estimates
356	//
357	for(k=samplesize; k<=npoints-1; k++)
358	{
359	for(j=0; j<=nclasses-1; j++)
360	{
361	y[j] = 0;
362	}
363	j = permbuf[k];
364	for(i_=0; i_<=nvars-1;i_++)
365	{
366	x[i_] = xy[j,i_];
367	}
368	dfprocessinternal(ref df, lasttreeoffs, ref x, ref y);
369	i1_ = (0) - (j*nclasses);
370	for(i_=jnclasses; i_<=(j+1)nclasses-1;i_++)
371	{
372	oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
373	}
374	oobcntbuf[j] = oobcntbuf[j]+1;
375	}
376	}
377	df.bufsize = offs;
378
379	//
380	// Normalize OOB results
381	//
382	for(i=0; i<=npoints-1; i++)
383	{
384	if( oobcntbuf[i]!=0 )
385	{
386	v = (double)(1)/(double)(oobcntbuf[i]);
387	for(i_=inclasses; i_<=inclasses+nclasses-1;i_++)
388	{
389	oobbuf[i_] = v*oobbuf[i_];
390	}
391	}
392	}
393
394	//
395	// Calculate training set estimates
396	//
397	rep.relclserror = dfrelclserror(ref df, ref xy, npoints);
398	rep.avgce = dfavgce(ref df, ref xy, npoints);
399	rep.rmserror = dfrmserror(ref df, ref xy, npoints);
400	rep.avgerror = dfavgerror(ref df, ref xy, npoints);
401	rep.avgrelerror = dfavgrelerror(ref df, ref xy, npoints);
402
403	//
404	// Calculate OOB estimates.
405	//
406	rep.oobrelclserror = 0;
407	rep.oobavgce = 0;
408	rep.oobrmserror = 0;
409	rep.oobavgerror = 0;
410	rep.oobavgrelerror = 0;
411	oobcnt = 0;
412	oobrelcnt = 0;
413	for(i=0; i<=npoints-1; i++)
414	{
415	if( oobcntbuf[i]!=0 )
416	{
417	ooboffs = i*nclasses;
418	if( nclasses>1 )
419	{
420
421	//
422	// classification-specific code
423	//
424	k = (int)Math.Round(xy[i,nvars]);
425	tmpi = 0;
426	for(j=1; j<=nclasses-1; j++)
427	{
428	if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
429	{
430	tmpi = j;
431	}
432	}
433	if( tmpi!=k )
434	{
435	rep.oobrelclserror = rep.oobrelclserror+1;
436	}
437	if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
438	{
439	rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
440	}
441	else
442	{
443	rep.oobavgce = rep.oobavgce-Math.Log(AP.Math.MinRealNumber);
444	}
445	for(j=0; j<=nclasses-1; j++)
446	{
447	if( j==k )
448	{
449	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs+j]-1);
450	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
451	rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
452	oobrelcnt = oobrelcnt+1;
453	}
454	else
455	{
456	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs+j]);
457	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
458	}
459	}
460	}
461	else
462	{
463
464	//
465	// regression-specific code
466	//
467	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs]-xy[i,nvars]);
468	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
469	if( (double)(xy[i,nvars])!=(double)(0) )
470	{
471	rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
472	oobrelcnt = oobrelcnt+1;
473	}
474	}
475
476	//
477	// update OOB estimates count.
478	//
479	oobcnt = oobcnt+1;
480	}
481	}
482	if( oobcnt>0 )
483	{
484	rep.oobrelclserror = rep.oobrelclserror/oobcnt;
485	rep.oobavgce = rep.oobavgce/oobcnt;
486	rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
487	rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
488	if( oobrelcnt>0 )
489	{
490	rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
491	}
492	}
493	}
494
495
496	/*************************************************************************
497	Procesing
498
499	INPUT PARAMETERS:
500	DF - decision forest model
501	X - input vector, array[0..NVars-1].
502
503	OUTPUT PARAMETERS:
504	Y - result. Regression estimate when solving regression task,
505	vector of posterior probabilities for classification task.
506	Subroutine does not allocate memory for this vector, it is
507	responsibility of a caller to allocate it. Array must be
508	at least [0..NClasses-1].
509
510	-- ALGLIB --
511	Copyright 16.02.2009 by Bochkanov Sergey
512	*************************************************************************/
513	public static void dfprocess(ref decisionforest df,
514	ref double[] x,
515	ref double[] y)
516	{
517	int offs = 0;
518	int i = 0;
519	double v = 0;
520	int i_ = 0;
521
522
523	//
524	// Proceed
525	//
526	offs = 0;
527	for(i=0; i<=df.nclasses-1; i++)
528	{
529	y[i] = 0;
530	}
531	for(i=0; i<=df.ntrees-1; i++)
532	{
533
534	//
535	// Process basic tree
536	//
537	dfprocessinternal(ref df, offs, ref x, ref y);
538
539	//
540	// Next tree
541	//
542	offs = offs+(int)Math.Round(df.trees[offs]);
543	}
544	v = (double)(1)/(double)(df.ntrees);
545	for(i_=0; i_<=df.nclasses-1;i_++)
546	{
547	y[i_] = v*y[i_];
548	}
549	}
550
551
552	/*************************************************************************
553	Relative classification error on the test set
554
555	INPUT PARAMETERS:
556	DF - decision forest model
557	XY - test set
558	NPoints - test set size
559
560	RESULT:
561	percent of incorrectly classified cases.
562	Zero if model solves regression task.
563
564	-- ALGLIB --
565	Copyright 16.02.2009 by Bochkanov Sergey
566	*************************************************************************/
567	public static double dfrelclserror(ref decisionforest df,
568	ref double[,] xy,
569	int npoints)
570	{
571	double result = 0;
572
573	result = (double)(dfclserror(ref df, ref xy, npoints))/(double)(npoints);
574	return result;
575	}
576
577
578	/*************************************************************************
579	Average cross-entropy (in bits per element) on the test set
580
581	INPUT PARAMETERS:
582	DF - decision forest model
583	XY - test set
584	NPoints - test set size
585
586	RESULT:
587	CrossEntropy/(NPoints*LN(2)).
588	Zero if model solves regression task.
589
590	-- ALGLIB --
591	Copyright 16.02.2009 by Bochkanov Sergey
592	*************************************************************************/
593	public static double dfavgce(ref decisionforest df,
594	ref double[,] xy,
595	int npoints)
596	{
597	double result = 0;
598	double[] x = new double[0];
599	double[] y = new double[0];
600	int i = 0;
601	int j = 0;
602	int k = 0;
603	int tmpi = 0;
604	int i_ = 0;
605
606	x = new double[df.nvars-1+1];
607	y = new double[df.nclasses-1+1];
608	result = 0;
609	for(i=0; i<=npoints-1; i++)
610	{
611	for(i_=0; i_<=df.nvars-1;i_++)
612	{
613	x[i_] = xy[i,i_];
614	}
615	dfprocess(ref df, ref x, ref y);
616	if( df.nclasses>1 )
617	{
618
619	//
620	// classification-specific code
621	//
622	k = (int)Math.Round(xy[i,df.nvars]);
623	tmpi = 0;
624	for(j=1; j<=df.nclasses-1; j++)
625	{
626	if( (double)(y[j])>(double)(y[tmpi]) )
627	{
628	tmpi = j;
629	}
630	}
631	if( (double)(y[k])!=(double)(0) )
632	{
633	result = result-Math.Log(y[k]);
634	}
635	else
636	{
637	result = result-Math.Log(AP.Math.MinRealNumber);
638	}
639	}
640	}
641	result = result/npoints;
642	return result;
643	}
644
645
646	/*************************************************************************
647	RMS error on the test set
648
649	INPUT PARAMETERS:
650	DF - decision forest model
651	XY - test set
652	NPoints - test set size
653
654	RESULT:
655	root mean square error.
656	Its meaning for regression task is obvious. As for
657	classification task, RMS error means error when estimating posterior
658	probabilities.
659
660	-- ALGLIB --
661	Copyright 16.02.2009 by Bochkanov Sergey
662	*************************************************************************/
663	public static double dfrmserror(ref decisionforest df,
664	ref double[,] xy,
665	int npoints)
666	{
667	double result = 0;
668	double[] x = new double[0];
669	double[] y = new double[0];
670	int i = 0;
671	int j = 0;
672	int k = 0;
673	int tmpi = 0;
674	int i_ = 0;
675
676	x = new double[df.nvars-1+1];
677	y = new double[df.nclasses-1+1];
678	result = 0;
679	for(i=0; i<=npoints-1; i++)
680	{
681	for(i_=0; i_<=df.nvars-1;i_++)
682	{
683	x[i_] = xy[i,i_];
684	}
685	dfprocess(ref df, ref x, ref y);
686	if( df.nclasses>1 )
687	{
688
689	//
690	// classification-specific code
691	//
692	k = (int)Math.Round(xy[i,df.nvars]);
693	tmpi = 0;
694	for(j=1; j<=df.nclasses-1; j++)
695	{
696	if( (double)(y[j])>(double)(y[tmpi]) )
697	{
698	tmpi = j;
699	}
700	}
701	for(j=0; j<=df.nclasses-1; j++)
702	{
703	if( j==k )
704	{
705	result = result+AP.Math.Sqr(y[j]-1);
706	}
707	else
708	{
709	result = result+AP.Math.Sqr(y[j]);
710	}
711	}
712	}
713	else
714	{
715
716	//
717	// regression-specific code
718	//
719	result = result+AP.Math.Sqr(y[0]-xy[i,df.nvars]);
720	}
721	}
722	result = Math.Sqrt(result/(npoints*df.nclasses));
723	return result;
724	}
725
726
727	/*************************************************************************
728	Average error on the test set
729
730	INPUT PARAMETERS:
731	DF - decision forest model
732	XY - test set
733	NPoints - test set size
734
735	RESULT:
736	Its meaning for regression task is obvious. As for
737	classification task, it means average error when estimating posterior
738	probabilities.
739
740	-- ALGLIB --
741	Copyright 16.02.2009 by Bochkanov Sergey
742	*************************************************************************/
743	public static double dfavgerror(ref decisionforest df,
744	ref double[,] xy,
745	int npoints)
746	{
747	double result = 0;
748	double[] x = new double[0];
749	double[] y = new double[0];
750	int i = 0;
751	int j = 0;
752	int k = 0;
753	int i_ = 0;
754
755	x = new double[df.nvars-1+1];
756	y = new double[df.nclasses-1+1];
757	result = 0;
758	for(i=0; i<=npoints-1; i++)
759	{
760	for(i_=0; i_<=df.nvars-1;i_++)
761	{
762	x[i_] = xy[i,i_];
763	}
764	dfprocess(ref df, ref x, ref y);
765	if( df.nclasses>1 )
766	{
767
768	//
769	// classification-specific code
770	//
771	k = (int)Math.Round(xy[i,df.nvars]);
772	for(j=0; j<=df.nclasses-1; j++)
773	{
774	if( j==k )
775	{
776	result = result+Math.Abs(y[j]-1);
777	}
778	else
779	{
780	result = result+Math.Abs(y[j]);
781	}
782	}
783	}
784	else
785	{
786
787	//
788	// regression-specific code
789	//
790	result = result+Math.Abs(y[0]-xy[i,df.nvars]);
791	}
792	}
793	result = result/(npoints*df.nclasses);
794	return result;
795	}
796
797
798	/*************************************************************************
799	Average relative error on the test set
800
801	INPUT PARAMETERS:
802	DF - decision forest model
803	XY - test set
804	NPoints - test set size
805
806	RESULT:
807	Its meaning for regression task is obvious. As for
808	classification task, it means average relative error when estimating
809	posterior probability of belonging to the correct class.
810
811	-- ALGLIB --
812	Copyright 16.02.2009 by Bochkanov Sergey
813	*************************************************************************/
814	public static double dfavgrelerror(ref decisionforest df,
815	ref double[,] xy,
816	int npoints)
817	{
818	double result = 0;
819	double[] x = new double[0];
820	double[] y = new double[0];
821	int relcnt = 0;
822	int i = 0;
823	int j = 0;
824	int k = 0;
825	int i_ = 0;
826
827	x = new double[df.nvars-1+1];
828	y = new double[df.nclasses-1+1];
829	result = 0;
830	relcnt = 0;
831	for(i=0; i<=npoints-1; i++)
832	{
833	for(i_=0; i_<=df.nvars-1;i_++)
834	{
835	x[i_] = xy[i,i_];
836	}
837	dfprocess(ref df, ref x, ref y);
838	if( df.nclasses>1 )
839	{
840
841	//
842	// classification-specific code
843	//
844	k = (int)Math.Round(xy[i,df.nvars]);
845	for(j=0; j<=df.nclasses-1; j++)
846	{
847	if( j==k )
848	{
849	result = result+Math.Abs(y[j]-1);
850	relcnt = relcnt+1;
851	}
852	}
853	}
854	else
855	{
856
857	//
858	// regression-specific code
859	//
860	if( (double)(xy[i,df.nvars])!=(double)(0) )
861	{
862	result = result+Math.Abs((y[0]-xy[i,df.nvars])/xy[i,df.nvars]);
863	relcnt = relcnt+1;
864	}
865	}
866	}
867	if( relcnt>0 )
868	{
869	result = result/relcnt;
870	}
871	return result;
872	}
873
874
875	/*************************************************************************
876	Copying of DecisionForest strucure
877
878	INPUT PARAMETERS:
879	DF1 - original
880
881	OUTPUT PARAMETERS:
882	DF2 - copy
883
884	-- ALGLIB --
885	Copyright 13.02.2009 by Bochkanov Sergey
886	*************************************************************************/
887	public static void dfcopy(ref decisionforest df1,
888	ref decisionforest df2)
889	{
890	int i_ = 0;
891
892	df2.nvars = df1.nvars;
893	df2.nclasses = df1.nclasses;
894	df2.ntrees = df1.ntrees;
895	df2.bufsize = df1.bufsize;
896	df2.trees = new double[df1.bufsize-1+1];
897	for(i_=0; i_<=df1.bufsize-1;i_++)
898	{
899	df2.trees[i_] = df1.trees[i_];
900	}
901	}
902
903
904	/*************************************************************************
905	Serialization of DecisionForest strucure
906
907	INPUT PARAMETERS:
908	DF - original
909
910	OUTPUT PARAMETERS:
911	RA - array of real numbers which stores decision forest,
912	array[0..RLen-1]
913	RLen - RA lenght
914
915	-- ALGLIB --
916	Copyright 13.02.2009 by Bochkanov Sergey
917	*************************************************************************/
918	public static void dfserialize(ref decisionforest df,
919	ref double[] ra,
920	ref int rlen)
921	{
922	int i_ = 0;
923	int i1_ = 0;
924
925	ra = new double[df.bufsize+5-1+1];
926	ra[0] = dfvnum;
927	ra[1] = df.nvars;
928	ra[2] = df.nclasses;
929	ra[3] = df.ntrees;
930	ra[4] = df.bufsize;
931	i1_ = (0) - (5);
932	for(i_=5; i_<=5+df.bufsize-1;i_++)
933	{
934	ra[i_] = df.trees[i_+i1_];
935	}
936	rlen = 5+df.bufsize;
937	}
938
939
940	/*************************************************************************
941	Unserialization of DecisionForest strucure
942
943	INPUT PARAMETERS:
944	RA - real array which stores decision forest
945
946	OUTPUT PARAMETERS:
947	DF - restored structure
948
949	-- ALGLIB --
950	Copyright 13.02.2009 by Bochkanov Sergey
951	*************************************************************************/
952	public static void dfunserialize(ref double[] ra,
953	ref decisionforest df)
954	{
955	int i_ = 0;
956	int i1_ = 0;
957
958	System.Diagnostics.Debug.Assert((int)Math.Round(ra[0])==dfvnum, "DFUnserialize: incorrect array!");
959	df.nvars = (int)Math.Round(ra[1]);
960	df.nclasses = (int)Math.Round(ra[2]);
961	df.ntrees = (int)Math.Round(ra[3]);
962	df.bufsize = (int)Math.Round(ra[4]);
963	df.trees = new double[df.bufsize-1+1];
964	i1_ = (5) - (0);
965	for(i_=0; i_<=df.bufsize-1;i_++)
966	{
967	df.trees[i_] = ra[i_+i1_];
968	}
969	}
970
971
972	/*************************************************************************
973	Classification error
974	*************************************************************************/
975	private static int dfclserror(ref decisionforest df,
976	ref double[,] xy,
977	int npoints)
978	{
979	int result = 0;
980	double[] x = new double[0];
981	double[] y = new double[0];
982	int i = 0;
983	int j = 0;
984	int k = 0;
985	int tmpi = 0;
986	int i_ = 0;
987
988	if( df.nclasses<=1 )
989	{
990	result = 0;
991	return result;
992	}
993	x = new double[df.nvars-1+1];
994	y = new double[df.nclasses-1+1];
995	result = 0;
996	for(i=0; i<=npoints-1; i++)
997	{
998	for(i_=0; i_<=df.nvars-1;i_++)
999	{
1000	x[i_] = xy[i,i_];
1001	}
1002	dfprocess(ref df, ref x, ref y);
1003	k = (int)Math.Round(xy[i,df.nvars]);
1004	tmpi = 0;
1005	for(j=1; j<=df.nclasses-1; j++)
1006	{
1007	if( (double)(y[j])>(double)(y[tmpi]) )
1008	{
1009	tmpi = j;
1010	}
1011	}
1012	if( tmpi!=k )
1013	{
1014	result = result+1;
1015	}
1016	}
1017	return result;
1018	}
1019
1020
1021	/*************************************************************************
1022	Internal subroutine for processing one decision tree starting at Offs
1023	*************************************************************************/
1024	private static void dfprocessinternal(ref decisionforest df,
1025	int offs,
1026	ref double[] x,
1027	ref double[] y)
1028	{
1029	int k = 0;
1030	int idx = 0;
1031
1032
1033	//
1034	// Set pointer to the root
1035	//
1036	k = offs+1;
1037
1038	//
1039	// Navigate through the tree
1040	//
1041	while( true )
1042	{
1043	if( (double)(df.trees[k])==(double)(-1) )
1044	{
1045	if( df.nclasses==1 )
1046	{
1047	y[0] = y[0]+df.trees[k+1];
1048	}
1049	else
1050	{
1051	idx = (int)Math.Round(df.trees[k+1]);
1052	y[idx] = y[idx]+1;
1053	}
1054	break;
1055	}
1056	if( (double)(x[(int)Math.Round(df.trees[k])])<(double)(df.trees[k+1]) )
1057	{
1058	k = k+innernodewidth;
1059	}
1060	else
1061	{
1062	k = offs+(int)Math.Round(df.trees[k+2]);
1063	}
1064	}
1065	}
1066
1067
1068	/*************************************************************************
1069	Builds one decision tree. Just a wrapper for the DFBuildTreeRec.
1070	*************************************************************************/
1071	private static void dfbuildtree(ref double[,] xy,
1072	int npoints,
1073	int nvars,
1074	int nclasses,
1075	int nfeatures,
1076	int nvarsinpool,
1077	int flags,
1078	ref dfinternalbuffers bufs)
1079	{
1080	int numprocessed = 0;
1081	int i = 0;
1082
1083	System.Diagnostics.Debug.Assert(npoints>0);
1084
1085	//
1086	// Prepare IdxBuf. It stores indices of the training set elements.
1087	// When training set is being split, contents of IdxBuf is
1088	// correspondingly reordered so we can know which elements belong
1089	// to which branch of decision tree.
1090	//
1091	for(i=0; i<=npoints-1; i++)
1092	{
1093	bufs.idxbuf[i] = i;
1094	}
1095
1096	//
1097	// Recursive procedure
1098	//
1099	numprocessed = 1;
1100	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, 0, npoints-1, ref bufs);
1101	bufs.treebuf[0] = numprocessed;
1102	}
1103
1104
1105	/*************************************************************************
1106	Builds one decision tree (internal recursive subroutine)
1107
1108	Parameters:
1109	TreeBuf - large enough array, at least TreeSize
1110	IdxBuf - at least NPoints elements
1111	TmpBufR - at least NPoints
1112	TmpBufR2 - at least NPoints
1113	TmpBufI - at least NPoints
1114	TmpBufI2 - at least NPoints+1
1115	*************************************************************************/
1116	private static void dfbuildtreerec(ref double[,] xy,
1117	int npoints,
1118	int nvars,
1119	int nclasses,
1120	int nfeatures,
1121	int nvarsinpool,
1122	int flags,
1123	ref int numprocessed,
1124	int idx1,
1125	int idx2,
1126	ref dfinternalbuffers bufs)
1127	{
1128	int i = 0;
1129	int j = 0;
1130	int k = 0;
1131	bool bflag = new bool();
1132	int i1 = 0;
1133	int i2 = 0;
1134	int info = 0;
1135	double sl = 0;
1136	double sr = 0;
1137	double w = 0;
1138	int idxbest = 0;
1139	double ebest = 0;
1140	double tbest = 0;
1141	int varcur = 0;
1142	double s = 0;
1143	double v = 0;
1144	double v1 = 0;
1145	double v2 = 0;
1146	double threshold = 0;
1147	int oldnp = 0;
1148	double currms = 0;
1149	bool useevs = new bool();
1150
1151	System.Diagnostics.Debug.Assert(npoints>0);
1152	System.Diagnostics.Debug.Assert(idx2>=idx1);
1153	useevs = flags/dfuseevs%2!=0;
1154
1155	//
1156	// Leaf node
1157	//
1158	if( idx2==idx1 )
1159	{
1160	bufs.treebuf[numprocessed] = -1;
1161	bufs.treebuf[numprocessed+1] = xy[bufs.idxbuf[idx1],nvars];
1162	numprocessed = numprocessed+leafnodewidth;
1163	return;
1164	}
1165
1166	//
1167	// Non-leaf node.
1168	// Select random variable, prepare split:
1169	// 1. prepare default solution - no splitting, class at random
1170	// 2. investigate possible splits, compare with default/best
1171	//
1172	idxbest = -1;
1173	if( nclasses>1 )
1174	{
1175
1176	//
1177	// default solution for classification
1178	//
1179	for(i=0; i<=nclasses-1; i++)
1180	{
1181	bufs.classibuf[i] = 0;
1182	}
1183	s = idx2-idx1+1;
1184	for(i=idx1; i<=idx2; i++)
1185	{
1186	j = (int)Math.Round(xy[bufs.idxbuf[i],nvars]);
1187	bufs.classibuf[j] = bufs.classibuf[j]+1;
1188	}
1189	ebest = 0;
1190	for(i=0; i<=nclasses-1; i++)
1191	{
1192	ebest = ebest+bufs.classibuf[i]AP.Math.Sqr(1-bufs.classibuf[i]/s)+(s-bufs.classibuf[i])AP.Math.Sqr(bufs.classibuf[i]/s);
1193	}
1194	ebest = Math.Sqrt(ebest/(nclasses*(idx2-idx1+1)));
1195	}
1196	else
1197	{
1198
1199	//
1200	// default solution for regression
1201	//
1202	v = 0;
1203	for(i=idx1; i<=idx2; i++)
1204	{
1205	v = v+xy[bufs.idxbuf[i],nvars];
1206	}
1207	v = v/(idx2-idx1+1);
1208	ebest = 0;
1209	for(i=idx1; i<=idx2; i++)
1210	{
1211	ebest = ebest+AP.Math.Sqr(xy[bufs.idxbuf[i],nvars]-v);
1212	}
1213	ebest = Math.Sqrt(ebest/(idx2-idx1+1));
1214	}
1215	i = 0;
1216	while( i<=Math.Min(nfeatures, nvarsinpool)-1 )
1217	{
1218
1219	//
1220	// select variables from pool
1221	//
1222	j = i+AP.Math.RandomInteger(nvarsinpool-i);
1223	k = bufs.varpool[i];
1224	bufs.varpool[i] = bufs.varpool[j];
1225	bufs.varpool[j] = k;
1226	varcur = bufs.varpool[i];
1227
1228	//
1229	// load variable values to working array
1230	//
1231	// apply EVS preprocessing: if all variable values are same,
1232	// variable is excluded from pool.
1233	//
1234	// This is necessary for binary pre-splits (see later) to work.
1235	//
1236	for(j=idx1; j<=idx2; j++)
1237	{
1238	bufs.tmpbufr[j-idx1] = xy[bufs.idxbuf[j],varcur];
1239	}
1240	if( useevs )
1241	{
1242	bflag = false;
1243	v = bufs.tmpbufr[0];
1244	for(j=0; j<=idx2-idx1; j++)
1245	{
1246	if( (double)(bufs.tmpbufr[j])!=(double)(v) )
1247	{
1248	bflag = true;
1249	break;
1250	}
1251	}
1252	if( !bflag )
1253	{
1254
1255	//
1256	// exclude variable from pool,
1257	// go to the next iteration.
1258	// I is not increased.
1259	//
1260	k = bufs.varpool[i];
1261	bufs.varpool[i] = bufs.varpool[nvarsinpool-1];
1262	bufs.varpool[nvarsinpool-1] = k;
1263	nvarsinpool = nvarsinpool-1;
1264	continue;
1265	}
1266	}
1267
1268	//
1269	// load labels to working array
1270	//
1271	if( nclasses>1 )
1272	{
1273	for(j=idx1; j<=idx2; j++)
1274	{
1275	bufs.tmpbufi[j-idx1] = (int)Math.Round(xy[bufs.idxbuf[j],nvars]);
1276	}
1277	}
1278	else
1279	{
1280	for(j=idx1; j<=idx2; j++)
1281	{
1282	bufs.tmpbufr2[j-idx1] = xy[bufs.idxbuf[j],nvars];
1283	}
1284	}
1285
1286	//
1287	// calculate split
1288	//
1289	if( useevs & bufs.evsbin[varcur] )
1290	{
1291
1292	//
1293	// Pre-calculated splits for binary variables.
1294	// Threshold is already known, just calculate RMS error
1295	//
1296	threshold = bufs.evssplits[varcur];
1297	if( nclasses>1 )
1298	{
1299
1300	//
1301	// classification-specific code
1302	//
1303	for(j=0; j<=2*nclasses-1; j++)
1304	{
1305	bufs.classibuf[j] = 0;
1306	}
1307	sl = 0;
1308	sr = 0;
1309	for(j=0; j<=idx2-idx1; j++)
1310	{
1311	k = bufs.tmpbufi[j];
1312	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1313	{
1314	bufs.classibuf[k] = bufs.classibuf[k]+1;
1315	sl = sl+1;
1316	}
1317	else
1318	{
1319	bufs.classibuf[k+nclasses] = bufs.classibuf[k+nclasses]+1;
1320	sr = sr+1;
1321	}
1322	}
1323	System.Diagnostics.Debug.Assert((double)(sl)!=(double)(0) & (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
1324	currms = 0;
1325	for(j=0; j<=nclasses-1; j++)
1326	{
1327	w = bufs.classibuf[j];
1328	currms = currms+w*AP.Math.Sqr(w/sl-1);
1329	currms = currms+(sl-w)*AP.Math.Sqr(w/sl);
1330	w = bufs.classibuf[nclasses+j];
1331	currms = currms+w*AP.Math.Sqr(w/sr-1);
1332	currms = currms+(sr-w)*AP.Math.Sqr(w/sr);
1333	}
1334	currms = Math.Sqrt(currms/(nclasses*(idx2-idx1+1)));
1335	}
1336	else
1337	{
1338
1339	//
1340	// regression-specific code
1341	//
1342	sl = 0;
1343	sr = 0;
1344	v1 = 0;
1345	v2 = 0;
1346	for(j=0; j<=idx2-idx1; j++)
1347	{
1348	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1349	{
1350	v1 = v1+bufs.tmpbufr2[j];
1351	sl = sl+1;
1352	}
1353	else
1354	{
1355	v2 = v2+bufs.tmpbufr2[j];
1356	sr = sr+1;
1357	}
1358	}
1359	System.Diagnostics.Debug.Assert((double)(sl)!=(double)(0) & (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
1360	v1 = v1/sl;
1361	v2 = v2/sr;
1362	currms = 0;
1363	for(j=0; j<=idx2-idx1; j++)
1364	{
1365	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1366	{
1367	currms = currms+AP.Math.Sqr(v1-bufs.tmpbufr2[j]);
1368	}
1369	else
1370	{
1371	currms = currms+AP.Math.Sqr(v2-bufs.tmpbufr2[j]);
1372	}
1373	}
1374	currms = Math.Sqrt(currms/(idx2-idx1+1));
1375	}
1376	info = 1;
1377	}
1378	else
1379	{
1380
1381	//
1382	// Generic splits
1383	//
1384	if( nclasses>1 )
1385	{
1386	dfsplitc(ref bufs.tmpbufr, ref bufs.tmpbufi, ref bufs.classibuf, idx2-idx1+1, nclasses, dfusestrongsplits, ref info, ref threshold, ref currms);
1387	}
1388	else
1389	{
1390	dfsplitr(ref bufs.tmpbufr, ref bufs.tmpbufr2, idx2-idx1+1, dfusestrongsplits, ref info, ref threshold, ref currms);
1391	}
1392	}
1393	if( info>0 )
1394	{
1395	if( (double)(currms)<=(double)(ebest) )
1396	{
1397	ebest = currms;
1398	idxbest = varcur;
1399	tbest = threshold;
1400	}
1401	}
1402
1403	//
1404	// Next iteration
1405	//
1406	i = i+1;
1407	}
1408
1409	//
1410	// to split or not to split
1411	//
1412	if( idxbest<0 )
1413	{
1414
1415	//
1416	// All values are same, cannot split.
1417	//
1418	bufs.treebuf[numprocessed] = -1;
1419	if( nclasses>1 )
1420	{
1421
1422	//
1423	// Select random class label (randomness allows us to
1424	// approximate distribution of the classes)
1425	//
1426	bufs.treebuf[numprocessed+1] = (int)Math.Round(xy[bufs.idxbuf[idx1+AP.Math.RandomInteger(idx2-idx1+1)],nvars]);
1427	}
1428	else
1429	{
1430
1431	//
1432	// Select average (for regression task).
1433	//
1434	v = 0;
1435	for(i=idx1; i<=idx2; i++)
1436	{
1437	v = v+xy[bufs.idxbuf[i],nvars]/(idx2-idx1+1);
1438	}
1439	bufs.treebuf[numprocessed+1] = v;
1440	}
1441	numprocessed = numprocessed+leafnodewidth;
1442	}
1443	else
1444	{
1445
1446	//
1447	// we can split
1448	//
1449	bufs.treebuf[numprocessed] = idxbest;
1450	bufs.treebuf[numprocessed+1] = tbest;
1451	i1 = idx1;
1452	i2 = idx2;
1453	while( i1<=i2 )
1454	{
1455
1456	//
1457	// Reorder indices so that left partition is in [Idx1..I1-1],
1458	// and right partition is in [I2+1..Idx2]
1459	//
1460	if( (double)(xy[bufs.idxbuf[i1],idxbest])<(double)(tbest) )
1461	{
1462	i1 = i1+1;
1463	continue;
1464	}
1465	if( (double)(xy[bufs.idxbuf[i2],idxbest])>=(double)(tbest) )
1466	{
1467	i2 = i2-1;
1468	continue;
1469	}
1470	j = bufs.idxbuf[i1];
1471	bufs.idxbuf[i1] = bufs.idxbuf[i2];
1472	bufs.idxbuf[i2] = j;
1473	i1 = i1+1;
1474	i2 = i2-1;
1475	}
1476	oldnp = numprocessed;
1477	numprocessed = numprocessed+innernodewidth;
1478	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, idx1, i1-1, ref bufs);
1479	bufs.treebuf[oldnp+2] = numprocessed;
1480	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, i2+1, idx2, ref bufs);
1481	}
1482	}
1483
1484
1485	/*************************************************************************
1486	Makes weak split on attribute
1487	*************************************************************************/
1488	private static void dfweakspliti(ref double[] x,
1489	ref int[] y,
1490	int n,
1491	int nclasses,
1492	ref int info,
1493	ref double threshold,
1494	ref double e)
1495	{
1496	int i = 0;
1497	int neq = 0;
1498	int nless = 0;
1499	int ngreater = 0;
1500
1501	tsort.tagsortfasti(ref x, ref y, n);
1502	if( n%2==1 )
1503	{
1504
1505	//
1506	// odd number of elements
1507	//
1508	threshold = x[n/2];
1509	}
1510	else
1511	{
1512
1513	//
1514	// even number of elements.
1515	//
1516	// if two closest to the middle of the array are equal,
1517	// we will select one of them (to avoid possible problems with
1518	// floating point errors).
1519	// we will select halfsum otherwise.
1520	//
1521	if( (double)(x[n/2-1])==(double)(x[n/2]) )
1522	{
1523	threshold = x[n/2-1];
1524	}
1525	else
1526	{
1527	threshold = 0.5*(x[n/2-1]+x[n/2]);
1528	}
1529	}
1530	neq = 0;
1531	nless = 0;
1532	ngreater = 0;
1533	for(i=0; i<=n-1; i++)
1534	{
1535	if( (double)(x[i])<(double)(threshold) )
1536	{
1537	nless = nless+1;
1538	}
1539	if( (double)(x[i])==(double)(threshold) )
1540	{
1541	neq = neq+1;
1542	}
1543	if( (double)(x[i])>(double)(threshold) )
1544	{
1545	ngreater = ngreater+1;
1546	}
1547	}
1548	if( nless==0 & ngreater==0 )
1549	{
1550	info = -3;
1551	}
1552	else
1553	{
1554	if( neq!=0 )
1555	{
1556	if( nless<ngreater )
1557	{
1558	threshold = 0.5*(x[nless+neq-1]+x[nless+neq]);
1559	}
1560	else
1561	{
1562	threshold = 0.5*(x[nless-1]+x[nless]);
1563	}
1564	}
1565	info = 1;
1566	e = 0;
1567	}
1568	}
1569
1570
1571	/*************************************************************************
1572	Makes split on attribute
1573	*************************************************************************/
1574	private static void dfsplitc(ref double[] x,
1575	ref int[] c,
1576	ref int[] cntbuf,
1577	int n,
1578	int nc,
1579	int flags,
1580	ref int info,
1581	ref double threshold,
1582	ref double e)
1583	{
1584	int i = 0;
1585	int neq = 0;
1586	int nless = 0;
1587	int ngreater = 0;
1588	int q = 0;
1589	int qmin = 0;
1590	int qmax = 0;
1591	int qcnt = 0;
1592	double cursplit = 0;
1593	int nleft = 0;
1594	double v = 0;
1595	double cure = 0;
1596	double w = 0;
1597	double sl = 0;
1598	double sr = 0;
1599
1600	tsort.tagsortfasti(ref x, ref c, n);
1601	e = AP.Math.MaxRealNumber;
1602	threshold = 0.5*(x[0]+x[n-1]);
1603	info = -3;
1604	if( flags/dfusestrongsplits%2==0 )
1605	{
1606
1607	//
1608	// weak splits, split at half
1609	//
1610	qcnt = 2;
1611	qmin = 1;
1612	qmax = 1;
1613	}
1614	else
1615	{
1616
1617	//
1618	// strong splits: choose best quartile
1619	//
1620	qcnt = 4;
1621	qmin = 1;
1622	qmax = 3;
1623	}
1624	for(q=qmin; q<=qmax; q++)
1625	{
1626	cursplit = x[n*q/qcnt];
1627	neq = 0;
1628	nless = 0;
1629	ngreater = 0;
1630	for(i=0; i<=n-1; i++)
1631	{
1632	if( (double)(x[i])<(double)(cursplit) )
1633	{
1634	nless = nless+1;
1635	}
1636	if( (double)(x[i])==(double)(cursplit) )
1637	{
1638	neq = neq+1;
1639	}
1640	if( (double)(x[i])>(double)(cursplit) )
1641	{
1642	ngreater = ngreater+1;
1643	}
1644	}
1645	System.Diagnostics.Debug.Assert(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1646	if( nless!=0 \| ngreater!=0 )
1647	{
1648
1649	//
1650	// set threshold between two partitions, with
1651	// some tweaking to avoid problems with floating point
1652	// arithmetics.
1653	//
1654	// The problem is that when you calculates C = 0.5*(A+B) there
1655	// can be no C which lies strictly between A and B (for example,
1656	// there is no floating point number which is
1657	// greater than 1 and less than 1+eps). In such situations
1658	// we choose right side as theshold (remember that
1659	// points which lie on threshold falls to the right side).
1660	//
1661	if( nless<ngreater )
1662	{
1663	cursplit = 0.5*(x[nless+neq-1]+x[nless+neq]);
1664	nleft = nless+neq;
1665	if( (double)(cursplit)<=(double)(x[nless+neq-1]) )
1666	{
1667	cursplit = x[nless+neq];
1668	}
1669	}
1670	else
1671	{
1672	cursplit = 0.5*(x[nless-1]+x[nless]);
1673	nleft = nless;
1674	if( (double)(cursplit)<=(double)(x[nless-1]) )
1675	{
1676	cursplit = x[nless];
1677	}
1678	}
1679	info = 1;
1680	cure = 0;
1681	for(i=0; i<=2*nc-1; i++)
1682	{
1683	cntbuf[i] = 0;
1684	}
1685	for(i=0; i<=nleft-1; i++)
1686	{
1687	cntbuf[c[i]] = cntbuf[c[i]]+1;
1688	}
1689	for(i=nleft; i<=n-1; i++)
1690	{
1691	cntbuf[nc+c[i]] = cntbuf[nc+c[i]]+1;
1692	}
1693	sl = nleft;
1694	sr = n-nleft;
1695	v = 0;
1696	for(i=0; i<=nc-1; i++)
1697	{
1698	w = cntbuf[i];
1699	v = v+w*AP.Math.Sqr(w/sl-1);
1700	v = v+(sl-w)*AP.Math.Sqr(w/sl);
1701	w = cntbuf[nc+i];
1702	v = v+w*AP.Math.Sqr(w/sr-1);
1703	v = v+(sr-w)*AP.Math.Sqr(w/sr);
1704	}
1705	cure = Math.Sqrt(v/(nc*n));
1706	if( (double)(cure)<(double)(e) )
1707	{
1708	threshold = cursplit;
1709	e = cure;
1710	}
1711	}
1712	}
1713	}
1714
1715
1716	/*************************************************************************
1717	Makes split on attribute
1718	*************************************************************************/
1719	private static void dfsplitr(ref double[] x,
1720	ref double[] y,
1721	int n,
1722	int flags,
1723	ref int info,
1724	ref double threshold,
1725	ref double e)
1726	{
1727	int i = 0;
1728	int neq = 0;
1729	int nless = 0;
1730	int ngreater = 0;
1731	int q = 0;
1732	int qmin = 0;
1733	int qmax = 0;
1734	int qcnt = 0;
1735	double cursplit = 0;
1736	int nleft = 0;
1737	double v = 0;
1738	double cure = 0;
1739
1740	tsort.tagsortfastr(ref x, ref y, n);
1741	e = AP.Math.MaxRealNumber;
1742	threshold = 0.5*(x[0]+x[n-1]);
1743	info = -3;
1744	if( flags/dfusestrongsplits%2==0 )
1745	{
1746
1747	//
1748	// weak splits, split at half
1749	//
1750	qcnt = 2;
1751	qmin = 1;
1752	qmax = 1;
1753	}
1754	else
1755	{
1756
1757	//
1758	// strong splits: choose best quartile
1759	//
1760	qcnt = 4;
1761	qmin = 1;
1762	qmax = 3;
1763	}
1764	for(q=qmin; q<=qmax; q++)
1765	{
1766	cursplit = x[n*q/qcnt];
1767	neq = 0;
1768	nless = 0;
1769	ngreater = 0;
1770	for(i=0; i<=n-1; i++)
1771	{
1772	if( (double)(x[i])<(double)(cursplit) )
1773	{
1774	nless = nless+1;
1775	}
1776	if( (double)(x[i])==(double)(cursplit) )
1777	{
1778	neq = neq+1;
1779	}
1780	if( (double)(x[i])>(double)(cursplit) )
1781	{
1782	ngreater = ngreater+1;
1783	}
1784	}
1785	System.Diagnostics.Debug.Assert(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1786	if( nless!=0 \| ngreater!=0 )
1787	{
1788
1789	//
1790	// set threshold between two partitions, with
1791	// some tweaking to avoid problems with floating point
1792	// arithmetics.
1793	//
1794	// The problem is that when you calculates C = 0.5*(A+B) there
1795	// can be no C which lies strictly between A and B (for example,
1796	// there is no floating point number which is
1797	// greater than 1 and less than 1+eps). In such situations
1798	// we choose right side as theshold (remember that
1799	// points which lie on threshold falls to the right side).
1800	//
1801	if( nless<ngreater )
1802	{
1803	cursplit = 0.5*(x[nless+neq-1]+x[nless+neq]);
1804	nleft = nless+neq;
1805	if( (double)(cursplit)<=(double)(x[nless+neq-1]) )
1806	{
1807	cursplit = x[nless+neq];
1808	}
1809	}
1810	else
1811	{
1812	cursplit = 0.5*(x[nless-1]+x[nless]);
1813	nleft = nless;
1814	if( (double)(cursplit)<=(double)(x[nless-1]) )
1815	{
1816	cursplit = x[nless];
1817	}
1818	}
1819	info = 1;
1820	cure = 0;
1821	v = 0;
1822	for(i=0; i<=nleft-1; i++)
1823	{
1824	v = v+y[i];
1825	}
1826	v = v/nleft;
1827	for(i=0; i<=nleft-1; i++)
1828	{
1829	cure = cure+AP.Math.Sqr(y[i]-v);
1830	}
1831	v = 0;
1832	for(i=nleft; i<=n-1; i++)
1833	{
1834	v = v+y[i];
1835	}
1836	v = v/(n-nleft);
1837	for(i=nleft; i<=n-1; i++)
1838	{
1839	cure = cure+AP.Math.Sqr(y[i]-v);
1840	}
1841	cure = Math.Sqrt(cure/n);
1842	if( (double)(cure)<(double)(e) )
1843	{
1844	threshold = cursplit;
1845	e = cure;
1846	}
1847	}
1848	}
1849	}
1850	}
1851	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences