Context Navigation

source: trunk/sources/ALGLIB/dforest.cs @ 2636

Visit:

Last change on this file since 2636 was 2563, checked in by gkronber, 15 years ago
Updated ALGLIB to latest version. #751 (Plugin for for data-modeling with ANN (integrated into CEDMA))
File size: 64.0 KB

Line
1	/*************************************************************************
2	Copyright (c) 2009, Sergey Bochkanov (ALGLIB project).
3
4	>>> SOURCE LICENSE >>>
5	This program is free software; you can redistribute it and/or modify
6	it under the terms of the GNU General Public License as published by
7	the Free Software Foundation (www.fsf.org); either version 2 of the
8	License, or (at your option) any later version.
9
10	This program is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	GNU General Public License for more details.
14
15	A copy of the GNU General Public License is available at
16	http://www.fsf.org/licensing/licenses
17
18	>>> END OF LICENSE >>>
19	*************************************************************************/
20
21	using System;
22
23	namespace alglib
24	{
25	public class dforest
26	{
27	public struct decisionforest
28	{
29	public int nvars;
30	public int nclasses;
31	public int ntrees;
32	public int bufsize;
33	public double[] trees;
34	};
35
36
37	public struct dfreport
38	{
39	public double relclserror;
40	public double avgce;
41	public double rmserror;
42	public double avgerror;
43	public double avgrelerror;
44	public double oobrelclserror;
45	public double oobavgce;
46	public double oobrmserror;
47	public double oobavgerror;
48	public double oobavgrelerror;
49	};
50
51
52	public struct dfinternalbuffers
53	{
54	public double[] treebuf;
55	public int[] idxbuf;
56	public double[] tmpbufr;
57	public double[] tmpbufr2;
58	public int[] tmpbufi;
59	public int[] classibuf;
60	public int[] varpool;
61	public bool[] evsbin;
62	public double[] evssplits;
63	};
64
65
66
67
68	public const int dfvnum = 8;
69	public const int innernodewidth = 3;
70	public const int leafnodewidth = 2;
71	public const int dfusestrongsplits = 1;
72	public const int dfuseevs = 2;
73
74
75	/*************************************************************************
76	This subroutine builds random decision forest.
77
78	INPUT PARAMETERS:
79	XY - training set
80	NPoints - training set size, NPoints>=1
81	NVars - number of independent variables, NVars>=1
82	NClasses - task type:
83	* NClasses=1 - regression task with one
84	dependent variable
85	* NClasses>1 - classification task with
86	NClasses classes.
87	NTrees - number of trees in a forest, NTrees>=1.
88	recommended values: 50-100.
89	R - percent of a training set used to build
90	individual trees. 0<R<=1.
91	recommended values: 0.1 <= R <= 0.66.
92
93	OUTPUT PARAMETERS:
94	Info - return code:
95	* -2, if there is a point with class number
96	outside of [0..NClasses-1].
97	* -1, if incorrect parameters was passed
98	(NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
99	or R>1).
100	* 1, if task has been solved
101	DF - model built
102	Rep - training report, contains error on a training set
103	and out-of-bag estimates of generalization error.
104
105	-- ALGLIB --
106	Copyright 19.02.2009 by Bochkanov Sergey
107	*************************************************************************/
108	public static void dfbuildrandomdecisionforest(ref double[,] xy,
109	int npoints,
110	int nvars,
111	int nclasses,
112	int ntrees,
113	double r,
114	ref int info,
115	ref decisionforest df,
116	ref dfreport rep)
117	{
118	int samplesize = 0;
119
120	if( (double)(r)<=(double)(0) \| (double)(r)>(double)(1) )
121	{
122	info = -1;
123	return;
124	}
125	samplesize = Math.Max((int)Math.Round(r*npoints), 1);
126	dfbuildinternal(ref xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, ref df, ref rep);
127	}
128
129
130	public static void dfbuildinternal(ref double[,] xy,
131	int npoints,
132	int nvars,
133	int nclasses,
134	int ntrees,
135	int samplesize,
136	int nfeatures,
137	int flags,
138	ref int info,
139	ref decisionforest df,
140	ref dfreport rep)
141	{
142	int i = 0;
143	int j = 0;
144	int k = 0;
145	int tmpi = 0;
146	int lasttreeoffs = 0;
147	int offs = 0;
148	int ooboffs = 0;
149	int treesize = 0;
150	int nvarsinpool = 0;
151	bool useevs = new bool();
152	dfinternalbuffers bufs = new dfinternalbuffers();
153	int[] permbuf = new int[0];
154	double[] oobbuf = new double[0];
155	int[] oobcntbuf = new int[0];
156	double[,] xys = new double[0,0];
157	double[] x = new double[0];
158	double[] y = new double[0];
159	int oobcnt = 0;
160	int oobrelcnt = 0;
161	double v = 0;
162	double vmin = 0;
163	double vmax = 0;
164	bool bflag = new bool();
165	int i_ = 0;
166	int i1_ = 0;
167
168
169	//
170	// Test for inputs
171	//
172	if( npoints<1 \| samplesize<1 \| samplesize>npoints \| nvars<1 \| nclasses<1 \| ntrees<1 \| nfeatures<1 )
173	{
174	info = -1;
175	return;
176	}
177	if( nclasses>1 )
178	{
179	for(i=0; i<=npoints-1; i++)
180	{
181	if( (int)Math.Round(xy[i,nvars])<0 \| (int)Math.Round(xy[i,nvars])>=nclasses )
182	{
183	info = -2;
184	return;
185	}
186	}
187	}
188	info = 1;
189
190	//
191	// Flags
192	//
193	useevs = flags/dfuseevs%2!=0;
194
195	//
196	// Allocate data, prepare header
197	//
198	treesize = 1+innernodewidth(samplesize-1)+leafnodewidthsamplesize;
199	permbuf = new int[npoints-1+1];
200	bufs.treebuf = new double[treesize-1+1];
201	bufs.idxbuf = new int[npoints-1+1];
202	bufs.tmpbufr = new double[npoints-1+1];
203	bufs.tmpbufr2 = new double[npoints-1+1];
204	bufs.tmpbufi = new int[npoints-1+1];
205	bufs.varpool = new int[nvars-1+1];
206	bufs.evsbin = new bool[nvars-1+1];
207	bufs.evssplits = new double[nvars-1+1];
208	bufs.classibuf = new int[2*nclasses-1+1];
209	oobbuf = new double[nclasses*npoints-1+1];
210	oobcntbuf = new int[npoints-1+1];
211	df.trees = new double[ntrees*treesize-1+1];
212	xys = new double[samplesize-1+1, nvars+1];
213	x = new double[nvars-1+1];
214	y = new double[nclasses-1+1];
215	for(i=0; i<=npoints-1; i++)
216	{
217	permbuf[i] = i;
218	}
219	for(i=0; i<=npoints*nclasses-1; i++)
220	{
221	oobbuf[i] = 0;
222	}
223	for(i=0; i<=npoints-1; i++)
224	{
225	oobcntbuf[i] = 0;
226	}
227
228	//
229	// Prepare variable pool and EVS (extended variable selection/splitting) buffers
230	// (whether EVS is turned on or not):
231	// 1. detect binary variables and pre-calculate splits for them
232	// 2. detect variables with non-distinct values and exclude them from pool
233	//
234	for(i=0; i<=nvars-1; i++)
235	{
236	bufs.varpool[i] = i;
237	}
238	nvarsinpool = nvars;
239	if( useevs )
240	{
241	for(j=0; j<=nvars-1; j++)
242	{
243	vmin = xy[0,j];
244	vmax = vmin;
245	for(i=0; i<=npoints-1; i++)
246	{
247	v = xy[i,j];
248	vmin = Math.Min(vmin, v);
249	vmax = Math.Max(vmax, v);
250	}
251	if( (double)(vmin)==(double)(vmax) )
252	{
253
254	//
255	// exclude variable from pool
256	//
257	bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
258	bufs.varpool[nvarsinpool-1] = -1;
259	nvarsinpool = nvarsinpool-1;
260	continue;
261	}
262	bflag = false;
263	for(i=0; i<=npoints-1; i++)
264	{
265	v = xy[i,j];
266	if( (double)(v)!=(double)(vmin) & (double)(v)!=(double)(vmax) )
267	{
268	bflag = true;
269	break;
270	}
271	}
272	if( bflag )
273	{
274
275	//
276	// non-binary variable
277	//
278	bufs.evsbin[j] = false;
279	}
280	else
281	{
282
283	//
284	// Prepare
285	//
286	bufs.evsbin[j] = true;
287	bufs.evssplits[j] = 0.5*(vmin+vmax);
288	if( (double)(bufs.evssplits[j])<=(double)(vmin) )
289	{
290	bufs.evssplits[j] = vmax;
291	}
292	}
293	}
294	}
295
296	//
297	// RANDOM FOREST FORMAT
298	// W[0] - size of array
299	// W[1] - version number
300	// W[2] - NVars
301	// W[3] - NClasses (1 for regression)
302	// W[4] - NTrees
303	// W[5] - trees offset
304	//
305	//
306	// TREE FORMAT
307	// W[Offs] - size of sub-array
308	// node info:
309	// W[K+0] - variable number (-1 for leaf mode)
310	// W[K+1] - threshold (class/value for leaf node)
311	// W[K+2] - ">=" branch index (absent for leaf node)
312	//
313	//
314	df.nvars = nvars;
315	df.nclasses = nclasses;
316	df.ntrees = ntrees;
317
318	//
319	// Build forest
320	//
321	offs = 0;
322	for(i=0; i<=ntrees-1; i++)
323	{
324
325	//
326	// Prepare sample
327	//
328	for(k=0; k<=samplesize-1; k++)
329	{
330	j = k+AP.Math.RandomInteger(npoints-k);
331	tmpi = permbuf[k];
332	permbuf[k] = permbuf[j];
333	permbuf[j] = tmpi;
334	j = permbuf[k];
335	for(i_=0; i_<=nvars;i_++)
336	{
337	xys[k,i_] = xy[j,i_];
338	}
339	}
340
341	//
342	// build tree, copy
343	//
344	dfbuildtree(ref xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, ref bufs);
345	j = (int)Math.Round(bufs.treebuf[0]);
346	i1_ = (0) - (offs);
347	for(i_=offs; i_<=offs+j-1;i_++)
348	{
349	df.trees[i_] = bufs.treebuf[i_+i1_];
350	}
351	lasttreeoffs = offs;
352	offs = offs+j;
353
354	//
355	// OOB estimates
356	//
357	for(k=samplesize; k<=npoints-1; k++)
358	{
359	for(j=0; j<=nclasses-1; j++)
360	{
361	y[j] = 0;
362	}
363	j = permbuf[k];
364	for(i_=0; i_<=nvars-1;i_++)
365	{
366	x[i_] = xy[j,i_];
367	}
368	dfprocessinternal(ref df, lasttreeoffs, ref x, ref y);
369	i1_ = (0) - (j*nclasses);
370	for(i_=jnclasses; i_<=(j+1)nclasses-1;i_++)
371	{
372	oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
373	}
374	oobcntbuf[j] = oobcntbuf[j]+1;
375	}
376	}
377	df.bufsize = offs;
378
379	//
380	// Normalize OOB results
381	//
382	for(i=0; i<=npoints-1; i++)
383	{
384	if( oobcntbuf[i]!=0 )
385	{
386	v = (double)(1)/(double)(oobcntbuf[i]);
387	for(i_=inclasses; i_<=inclasses+nclasses-1;i_++)
388	{
389	oobbuf[i_] = v*oobbuf[i_];
390	}
391	}
392	}
393
394	//
395	// Calculate training set estimates
396	//
397	rep.relclserror = dfrelclserror(ref df, ref xy, npoints);
398	rep.avgce = dfavgce(ref df, ref xy, npoints);
399	rep.rmserror = dfrmserror(ref df, ref xy, npoints);
400	rep.avgerror = dfavgerror(ref df, ref xy, npoints);
401	rep.avgrelerror = dfavgrelerror(ref df, ref xy, npoints);
402
403	//
404	// Calculate OOB estimates.
405	//
406	rep.oobrelclserror = 0;
407	rep.oobavgce = 0;
408	rep.oobrmserror = 0;
409	rep.oobavgerror = 0;
410	rep.oobavgrelerror = 0;
411	oobcnt = 0;
412	oobrelcnt = 0;
413	for(i=0; i<=npoints-1; i++)
414	{
415	if( oobcntbuf[i]!=0 )
416	{
417	ooboffs = i*nclasses;
418	if( nclasses>1 )
419	{
420
421	//
422	// classification-specific code
423	//
424	k = (int)Math.Round(xy[i,nvars]);
425	tmpi = 0;
426	for(j=1; j<=nclasses-1; j++)
427	{
428	if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
429	{
430	tmpi = j;
431	}
432	}
433	if( tmpi!=k )
434	{
435	rep.oobrelclserror = rep.oobrelclserror+1;
436	}
437	if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
438	{
439	rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
440	}
441	else
442	{
443	rep.oobavgce = rep.oobavgce-Math.Log(AP.Math.MinRealNumber);
444	}
445	for(j=0; j<=nclasses-1; j++)
446	{
447	if( j==k )
448	{
449	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs+j]-1);
450	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
451	rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
452	oobrelcnt = oobrelcnt+1;
453	}
454	else
455	{
456	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs+j]);
457	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
458	}
459	}
460	}
461	else
462	{
463
464	//
465	// regression-specific code
466	//
467	rep.oobrmserror = rep.oobrmserror+AP.Math.Sqr(oobbuf[ooboffs]-xy[i,nvars]);
468	rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
469	if( (double)(xy[i,nvars])!=(double)(0) )
470	{
471	rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
472	oobrelcnt = oobrelcnt+1;
473	}
474	}
475
476	//
477	// update OOB estimates count.
478	//
479	oobcnt = oobcnt+1;
480	}
481	}
482	if( oobcnt>0 )
483	{
484	rep.oobrelclserror = rep.oobrelclserror/oobcnt;
485	rep.oobavgce = rep.oobavgce/oobcnt;
486	rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
487	rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
488	if( oobrelcnt>0 )
489	{
490	rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
491	}
492	}
493	}
494
495
496	/*************************************************************************
497	Procesing
498
499	INPUT PARAMETERS:
500	DF - decision forest model
501	X - input vector, array[0..NVars-1].
502
503	OUTPUT PARAMETERS:
504	Y - result. Regression estimate when solving regression task,
505	vector of posterior probabilities for classification task.
506	Subroutine does not allocate memory for this vector, it is
507	responsibility of a caller to allocate it. Array must be
508	at least [0..NClasses-1].
509
510	-- ALGLIB --
511	Copyright 16.02.2009 by Bochkanov Sergey
512	*************************************************************************/
513	public static void dfprocess(ref decisionforest df,
514	ref double[] x,
515	ref double[] y)
516	{
517	int offs = 0;
518	int i = 0;
519	double v = 0;
520	int i_ = 0;
521
522
523	//
524	// Proceed
525	//
526	offs = 0;
527	for(i=0; i<=df.nclasses-1; i++)
528	{
529	y[i] = 0;
530	}
531	for(i=0; i<=df.ntrees-1; i++)
532	{
533
534	//
535	// Process basic tree
536	//
537	dfprocessinternal(ref df, offs, ref x, ref y);
538
539	//
540	// Next tree
541	//
542	offs = offs+(int)Math.Round(df.trees[offs]);
543	}
544	v = (double)(1)/(double)(df.ntrees);
545	for(i_=0; i_<=df.nclasses-1;i_++)
546	{
547	y[i_] = v*y[i_];
548	}
549	}
550
551
552	/*************************************************************************
553	Relative classification error on the test set
554
555	INPUT PARAMETERS:
556	DF - decision forest model
557	XY - test set
558	NPoints - test set size
559
560	RESULT:
561	percent of incorrectly classified cases.
562	Zero if model solves regression task.
563
564	-- ALGLIB --
565	Copyright 16.02.2009 by Bochkanov Sergey
566	*************************************************************************/
567	public static double dfrelclserror(ref decisionforest df,
568	ref double[,] xy,
569	int npoints)
570	{
571	double result = 0;
572
573	result = (double)(dfclserror(ref df, ref xy, npoints))/(double)(npoints);
574	return result;
575	}
576
577
578	/*************************************************************************
579	Average cross-entropy (in bits per element) on the test set
580
581	INPUT PARAMETERS:
582	DF - decision forest model
583	XY - test set
584	NPoints - test set size
585
586	RESULT:
587	CrossEntropy/(NPoints*LN(2)).
588	Zero if model solves regression task.
589
590	-- ALGLIB --
591	Copyright 16.02.2009 by Bochkanov Sergey
592	*************************************************************************/
593	public static double dfavgce(ref decisionforest df,
594	ref double[,] xy,
595	int npoints)
596	{
597	double result = 0;
598	double[] x = new double[0];
599	double[] y = new double[0];
600	int i = 0;
601	int j = 0;
602	int k = 0;
603	int tmpi = 0;
604	int i_ = 0;
605
606	x = new double[df.nvars-1+1];
607	y = new double[df.nclasses-1+1];
608	result = 0;
609	for(i=0; i<=npoints-1; i++)
610	{
611	for(i_=0; i_<=df.nvars-1;i_++)
612	{
613	x[i_] = xy[i,i_];
614	}
615	dfprocess(ref df, ref x, ref y);
616	if( df.nclasses>1 )
617	{
618
619	//
620	// classification-specific code
621	//
622	k = (int)Math.Round(xy[i,df.nvars]);
623	tmpi = 0;
624	for(j=1; j<=df.nclasses-1; j++)
625	{
626	if( (double)(y[j])>(double)(y[tmpi]) )
627	{
628	tmpi = j;
629	}
630	}
631	if( (double)(y[k])!=(double)(0) )
632	{
633	result = result-Math.Log(y[k]);
634	}
635	else
636	{
637	result = result-Math.Log(AP.Math.MinRealNumber);
638	}
639	}
640	}
641	result = result/npoints;
642	return result;
643	}
644
645
646	/*************************************************************************
647	RMS error on the test set
648
649	INPUT PARAMETERS:
650	DF - decision forest model
651	XY - test set
652	NPoints - test set size
653
654	RESULT:
655	root mean square error.
656	Its meaning for regression task is obvious. As for
657	classification task, RMS error means error when estimating posterior
658	probabilities.
659
660	-- ALGLIB --
661	Copyright 16.02.2009 by Bochkanov Sergey
662	*************************************************************************/
663	public static double dfrmserror(ref decisionforest df,
664	ref double[,] xy,
665	int npoints)
666	{
667	double result = 0;
668	double[] x = new double[0];
669	double[] y = new double[0];
670	int i = 0;
671	int j = 0;
672	int k = 0;
673	int tmpi = 0;
674	int i_ = 0;
675
676	x = new double[df.nvars-1+1];
677	y = new double[df.nclasses-1+1];
678	result = 0;
679	for(i=0; i<=npoints-1; i++)
680	{
681	for(i_=0; i_<=df.nvars-1;i_++)
682	{
683	x[i_] = xy[i,i_];
684	}
685	dfprocess(ref df, ref x, ref y);
686	if( df.nclasses>1 )
687	{
688
689	//
690	// classification-specific code
691	//
692	k = (int)Math.Round(xy[i,df.nvars]);
693	tmpi = 0;
694	for(j=1; j<=df.nclasses-1; j++)
695	{
696	if( (double)(y[j])>(double)(y[tmpi]) )
697	{
698	tmpi = j;
699	}
700	}
701	for(j=0; j<=df.nclasses-1; j++)
702	{
703	if( j==k )
704	{
705	result = result+AP.Math.Sqr(y[j]-1);
706	}
707	else
708	{
709	result = result+AP.Math.Sqr(y[j]);
710	}
711	}
712	}
713	else
714	{
715
716	//
717	// regression-specific code
718	//
719	result = result+AP.Math.Sqr(y[0]-xy[i,df.nvars]);
720	}
721	}
722	result = Math.Sqrt(result/(npoints*df.nclasses));
723	return result;
724	}
725
726
727	/*************************************************************************
728	Average error on the test set
729
730	INPUT PARAMETERS:
731	DF - decision forest model
732	XY - test set
733	NPoints - test set size
734
735	RESULT:
736	Its meaning for regression task is obvious. As for
737	classification task, it means average error when estimating posterior
738	probabilities.
739
740	-- ALGLIB --
741	Copyright 16.02.2009 by Bochkanov Sergey
742	*************************************************************************/
743	public static double dfavgerror(ref decisionforest df,
744	ref double[,] xy,
745	int npoints)
746	{
747	double result = 0;
748	double[] x = new double[0];
749	double[] y = new double[0];
750	int i = 0;
751	int j = 0;
752	int k = 0;
753	int i_ = 0;
754
755	x = new double[df.nvars-1+1];
756	y = new double[df.nclasses-1+1];
757	result = 0;
758	for(i=0; i<=npoints-1; i++)
759	{
760	for(i_=0; i_<=df.nvars-1;i_++)
761	{
762	x[i_] = xy[i,i_];
763	}
764	dfprocess(ref df, ref x, ref y);
765	if( df.nclasses>1 )
766	{
767
768	//
769	// classification-specific code
770	//
771	k = (int)Math.Round(xy[i,df.nvars]);
772	for(j=0; j<=df.nclasses-1; j++)
773	{
774	if( j==k )
775	{
776	result = result+Math.Abs(y[j]-1);
777	}
778	else
779	{
780	result = result+Math.Abs(y[j]);
781	}
782	}
783	}
784	else
785	{
786
787	//
788	// regression-specific code
789	//
790	result = result+Math.Abs(y[0]-xy[i,df.nvars]);
791	}
792	}
793	result = result/(npoints*df.nclasses);
794	return result;
795	}
796
797
798	/*************************************************************************
799	Average relative error on the test set
800
801	INPUT PARAMETERS:
802	DF - decision forest model
803	XY - test set
804	NPoints - test set size
805
806	RESULT:
807	Its meaning for regression task is obvious. As for
808	classification task, it means average relative error when estimating
809	posterior probability of belonging to the correct class.
810
811	-- ALGLIB --
812	Copyright 16.02.2009 by Bochkanov Sergey
813	*************************************************************************/
814	public static double dfavgrelerror(ref decisionforest df,
815	ref double[,] xy,
816	int npoints)
817	{
818	double result = 0;
819	double[] x = new double[0];
820	double[] y = new double[0];
821	int relcnt = 0;
822	int i = 0;
823	int j = 0;
824	int k = 0;
825	int i_ = 0;
826
827	x = new double[df.nvars-1+1];
828	y = new double[df.nclasses-1+1];
829	result = 0;
830	relcnt = 0;
831	for(i=0; i<=npoints-1; i++)
832	{
833	for(i_=0; i_<=df.nvars-1;i_++)
834	{
835	x[i_] = xy[i,i_];
836	}
837	dfprocess(ref df, ref x, ref y);
838	if( df.nclasses>1 )
839	{
840
841	//
842	// classification-specific code
843	//
844	k = (int)Math.Round(xy[i,df.nvars]);
845	for(j=0; j<=df.nclasses-1; j++)
846	{
847	if( j==k )
848	{
849	result = result+Math.Abs(y[j]-1);
850	relcnt = relcnt+1;
851	}
852	}
853	}
854	else
855	{
856
857	//
858	// regression-specific code
859	//
860	if( (double)(xy[i,df.nvars])!=(double)(0) )
861	{
862	result = result+Math.Abs((y[0]-xy[i,df.nvars])/xy[i,df.nvars]);
863	relcnt = relcnt+1;
864	}
865	}
866	}
867	if( relcnt>0 )
868	{
869	result = result/relcnt;
870	}
871	return result;
872	}
873
874
875	/*************************************************************************
876	Copying of DecisionForest strucure
877
878	INPUT PARAMETERS:
879	DF1 - original
880
881	OUTPUT PARAMETERS:
882	DF2 - copy
883
884	-- ALGLIB --
885	Copyright 13.02.2009 by Bochkanov Sergey
886	*************************************************************************/
887	public static void dfcopy(ref decisionforest df1,
888	ref decisionforest df2)
889	{
890	int i_ = 0;
891
892	df2.nvars = df1.nvars;
893	df2.nclasses = df1.nclasses;
894	df2.ntrees = df1.ntrees;
895	df2.bufsize = df1.bufsize;
896	df2.trees = new double[df1.bufsize-1+1];
897	for(i_=0; i_<=df1.bufsize-1;i_++)
898	{
899	df2.trees[i_] = df1.trees[i_];
900	}
901	}
902
903
904	/*************************************************************************
905	Serialization of DecisionForest strucure
906
907	INPUT PARAMETERS:
908	DF - original
909
910	OUTPUT PARAMETERS:
911	RA - array of real numbers which stores decision forest,
912	array[0..RLen-1]
913	RLen - RA lenght
914
915	-- ALGLIB --
916	Copyright 13.02.2009 by Bochkanov Sergey
917	*************************************************************************/
918	public static void dfserialize(ref decisionforest df,
919	ref double[] ra,
920	ref int rlen)
921	{
922	int i_ = 0;
923	int i1_ = 0;
924
925	ra = new double[df.bufsize+5-1+1];
926	ra[0] = dfvnum;
927	ra[1] = df.nvars;
928	ra[2] = df.nclasses;
929	ra[3] = df.ntrees;
930	ra[4] = df.bufsize;
931	i1_ = (0) - (5);
932	for(i_=5; i_<=5+df.bufsize-1;i_++)
933	{
934	ra[i_] = df.trees[i_+i1_];
935	}
936	rlen = 5+df.bufsize;
937	}
938
939
940	/*************************************************************************
941	Unserialization of DecisionForest strucure
942
943	INPUT PARAMETERS:
944	RA - real array which stores decision forest
945
946	OUTPUT PARAMETERS:
947	DF - restored structure
948
949	-- ALGLIB --
950	Copyright 13.02.2009 by Bochkanov Sergey
951	*************************************************************************/
952	public static void dfunserialize(ref double[] ra,
953	ref decisionforest df)
954	{
955	int i_ = 0;
956	int i1_ = 0;
957
958	System.Diagnostics.Debug.Assert((int)Math.Round(ra[0])==dfvnum, "DFUnserialize: incorrect array!");
959	df.nvars = (int)Math.Round(ra[1]);
960	df.nclasses = (int)Math.Round(ra[2]);
961	df.ntrees = (int)Math.Round(ra[3]);
962	df.bufsize = (int)Math.Round(ra[4]);
963	df.trees = new double[df.bufsize-1+1];
964	i1_ = (5) - (0);
965	for(i_=0; i_<=df.bufsize-1;i_++)
966	{
967	df.trees[i_] = ra[i_+i1_];
968	}
969	}
970
971
972	/*************************************************************************
973	Classification error
974	*************************************************************************/
975	private static int dfclserror(ref decisionforest df,
976	ref double[,] xy,
977	int npoints)
978	{
979	int result = 0;
980	double[] x = new double[0];
981	double[] y = new double[0];
982	int i = 0;
983	int j = 0;
984	int k = 0;
985	int tmpi = 0;
986	int i_ = 0;
987
988	if( df.nclasses<=1 )
989	{
990	result = 0;
991	return result;
992	}
993	x = new double[df.nvars-1+1];
994	y = new double[df.nclasses-1+1];
995	result = 0;
996	for(i=0; i<=npoints-1; i++)
997	{
998	for(i_=0; i_<=df.nvars-1;i_++)
999	{
1000	x[i_] = xy[i,i_];
1001	}
1002	dfprocess(ref df, ref x, ref y);
1003	k = (int)Math.Round(xy[i,df.nvars]);
1004	tmpi = 0;
1005	for(j=1; j<=df.nclasses-1; j++)
1006	{
1007	if( (double)(y[j])>(double)(y[tmpi]) )
1008	{
1009	tmpi = j;
1010	}
1011	}
1012	if( tmpi!=k )
1013	{
1014	result = result+1;
1015	}
1016	}
1017	return result;
1018	}
1019
1020
1021	/*************************************************************************
1022	Internal subroutine for processing one decision tree starting at Offs
1023	*************************************************************************/
1024	private static void dfprocessinternal(ref decisionforest df,
1025	int offs,
1026	ref double[] x,
1027	ref double[] y)
1028	{
1029	int i = 0;
1030	int k = 0;
1031	int idx = 0;
1032
1033
1034	//
1035	// Set pointer to the root
1036	//
1037	k = offs+1;
1038
1039	//
1040	// Navigate through the tree
1041	//
1042	while( true )
1043	{
1044	if( (double)(df.trees[k])==(double)(-1) )
1045	{
1046	if( df.nclasses==1 )
1047	{
1048	y[0] = y[0]+df.trees[k+1];
1049	}
1050	else
1051	{
1052	idx = (int)Math.Round(df.trees[k+1]);
1053	y[idx] = y[idx]+1;
1054	}
1055	break;
1056	}
1057	if( (double)(x[(int)Math.Round(df.trees[k])])<(double)(df.trees[k+1]) )
1058	{
1059	k = k+innernodewidth;
1060	}
1061	else
1062	{
1063	k = offs+(int)Math.Round(df.trees[k+2]);
1064	}
1065	}
1066	}
1067
1068
1069	/*************************************************************************
1070	Builds one decision tree. Just a wrapper for the DFBuildTreeRec.
1071	*************************************************************************/
1072	private static void dfbuildtree(ref double[,] xy,
1073	int npoints,
1074	int nvars,
1075	int nclasses,
1076	int nfeatures,
1077	int nvarsinpool,
1078	int flags,
1079	ref dfinternalbuffers bufs)
1080	{
1081	int numprocessed = 0;
1082	int i = 0;
1083
1084	System.Diagnostics.Debug.Assert(npoints>0);
1085
1086	//
1087	// Prepare IdxBuf. It stores indices of the training set elements.
1088	// When training set is being split, contents of IdxBuf is
1089	// correspondingly reordered so we can know which elements belong
1090	// to which branch of decision tree.
1091	//
1092	for(i=0; i<=npoints-1; i++)
1093	{
1094	bufs.idxbuf[i] = i;
1095	}
1096
1097	//
1098	// Recursive procedure
1099	//
1100	numprocessed = 1;
1101	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, 0, npoints-1, ref bufs);
1102	bufs.treebuf[0] = numprocessed;
1103	}
1104
1105
1106	/*************************************************************************
1107	Builds one decision tree (internal recursive subroutine)
1108
1109	Parameters:
1110	TreeBuf - large enough array, at least TreeSize
1111	IdxBuf - at least NPoints elements
1112	TmpBufR - at least NPoints
1113	TmpBufR2 - at least NPoints
1114	TmpBufI - at least NPoints
1115	TmpBufI2 - at least NPoints+1
1116	*************************************************************************/
1117	private static void dfbuildtreerec(ref double[,] xy,
1118	int npoints,
1119	int nvars,
1120	int nclasses,
1121	int nfeatures,
1122	int nvarsinpool,
1123	int flags,
1124	ref int numprocessed,
1125	int idx1,
1126	int idx2,
1127	ref dfinternalbuffers bufs)
1128	{
1129	int i = 0;
1130	int j = 0;
1131	int k = 0;
1132	bool bflag = new bool();
1133	int offs = 0;
1134	int i1 = 0;
1135	int i2 = 0;
1136	int lsize = 0;
1137	int info = 0;
1138	double sl = 0;
1139	double sr = 0;
1140	double w = 0;
1141	int idxbest = 0;
1142	double ebest = 0;
1143	double tbest = 0;
1144	int varcur = 0;
1145	double s = 0;
1146	double v = 0;
1147	double v1 = 0;
1148	double v2 = 0;
1149	int nbuf = 0;
1150	double threshold = 0;
1151	int oldnp = 0;
1152	double e = 0;
1153	double currms = 0;
1154	double curcvrms = 0;
1155	bool useevs = new bool();
1156
1157	System.Diagnostics.Debug.Assert(npoints>0);
1158	System.Diagnostics.Debug.Assert(idx2>=idx1);
1159	useevs = flags/dfuseevs%2!=0;
1160
1161	//
1162	// Leaf node
1163	//
1164	if( idx2==idx1 )
1165	{
1166	bufs.treebuf[numprocessed] = -1;
1167	bufs.treebuf[numprocessed+1] = xy[bufs.idxbuf[idx1],nvars];
1168	numprocessed = numprocessed+leafnodewidth;
1169	return;
1170	}
1171
1172	//
1173	// Non-leaf node.
1174	// Select random variable, prepare split:
1175	// 1. prepare default solution - no splitting, class at random
1176	// 2. investigate possible splits, compare with default/best
1177	//
1178	idxbest = -1;
1179	if( nclasses>1 )
1180	{
1181
1182	//
1183	// default solution for classification
1184	//
1185	for(i=0; i<=nclasses-1; i++)
1186	{
1187	bufs.classibuf[i] = 0;
1188	}
1189	s = idx2-idx1+1;
1190	for(i=idx1; i<=idx2; i++)
1191	{
1192	j = (int)Math.Round(xy[bufs.idxbuf[i],nvars]);
1193	bufs.classibuf[j] = bufs.classibuf[j]+1;
1194	}
1195	ebest = 0;
1196	for(i=0; i<=nclasses-1; i++)
1197	{
1198	ebest = ebest+bufs.classibuf[i]AP.Math.Sqr(1-bufs.classibuf[i]/s)+(s-bufs.classibuf[i])AP.Math.Sqr(bufs.classibuf[i]/s);
1199	}
1200	ebest = Math.Sqrt(ebest/(nclasses*(idx2-idx1+1)));
1201	}
1202	else
1203	{
1204
1205	//
1206	// default solution for regression
1207	//
1208	v = 0;
1209	for(i=idx1; i<=idx2; i++)
1210	{
1211	v = v+xy[bufs.idxbuf[i],nvars];
1212	}
1213	v = v/(idx2-idx1+1);
1214	ebest = 0;
1215	for(i=idx1; i<=idx2; i++)
1216	{
1217	ebest = ebest+AP.Math.Sqr(xy[bufs.idxbuf[i],nvars]-v);
1218	}
1219	ebest = Math.Sqrt(ebest/(idx2-idx1+1));
1220	}
1221	i = 0;
1222	while( i<=Math.Min(nfeatures, nvarsinpool)-1 )
1223	{
1224
1225	//
1226	// select variables from pool
1227	//
1228	j = i+AP.Math.RandomInteger(nvarsinpool-i);
1229	k = bufs.varpool[i];
1230	bufs.varpool[i] = bufs.varpool[j];
1231	bufs.varpool[j] = k;
1232	varcur = bufs.varpool[i];
1233
1234	//
1235	// load variable values to working array
1236	//
1237	// apply EVS preprocessing: if all variable values are same,
1238	// variable is excluded from pool.
1239	//
1240	// This is necessary for binary pre-splits (see later) to work.
1241	//
1242	for(j=idx1; j<=idx2; j++)
1243	{
1244	bufs.tmpbufr[j-idx1] = xy[bufs.idxbuf[j],varcur];
1245	}
1246	if( useevs )
1247	{
1248	bflag = false;
1249	v = bufs.tmpbufr[0];
1250	for(j=0; j<=idx2-idx1; j++)
1251	{
1252	if( (double)(bufs.tmpbufr[j])!=(double)(v) )
1253	{
1254	bflag = true;
1255	break;
1256	}
1257	}
1258	if( !bflag )
1259	{
1260
1261	//
1262	// exclude variable from pool,
1263	// go to the next iteration.
1264	// I is not increased.
1265	//
1266	k = bufs.varpool[i];
1267	bufs.varpool[i] = bufs.varpool[nvarsinpool-1];
1268	bufs.varpool[nvarsinpool-1] = k;
1269	nvarsinpool = nvarsinpool-1;
1270	continue;
1271	}
1272	}
1273
1274	//
1275	// load labels to working array
1276	//
1277	if( nclasses>1 )
1278	{
1279	for(j=idx1; j<=idx2; j++)
1280	{
1281	bufs.tmpbufi[j-idx1] = (int)Math.Round(xy[bufs.idxbuf[j],nvars]);
1282	}
1283	}
1284	else
1285	{
1286	for(j=idx1; j<=idx2; j++)
1287	{
1288	bufs.tmpbufr2[j-idx1] = xy[bufs.idxbuf[j],nvars];
1289	}
1290	}
1291
1292	//
1293	// calculate split
1294	//
1295	if( useevs & bufs.evsbin[varcur] )
1296	{
1297
1298	//
1299	// Pre-calculated splits for binary variables.
1300	// Threshold is already known, just calculate RMS error
1301	//
1302	threshold = bufs.evssplits[varcur];
1303	if( nclasses>1 )
1304	{
1305
1306	//
1307	// classification-specific code
1308	//
1309	for(j=0; j<=2*nclasses-1; j++)
1310	{
1311	bufs.classibuf[j] = 0;
1312	}
1313	sl = 0;
1314	sr = 0;
1315	for(j=0; j<=idx2-idx1; j++)
1316	{
1317	k = bufs.tmpbufi[j];
1318	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1319	{
1320	bufs.classibuf[k] = bufs.classibuf[k]+1;
1321	sl = sl+1;
1322	}
1323	else
1324	{
1325	bufs.classibuf[k+nclasses] = bufs.classibuf[k+nclasses]+1;
1326	sr = sr+1;
1327	}
1328	}
1329	System.Diagnostics.Debug.Assert((double)(sl)!=(double)(0) & (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
1330	currms = 0;
1331	for(j=0; j<=nclasses-1; j++)
1332	{
1333	w = bufs.classibuf[j];
1334	currms = currms+w*AP.Math.Sqr(w/sl-1);
1335	currms = currms+(sl-w)*AP.Math.Sqr(w/sl);
1336	w = bufs.classibuf[nclasses+j];
1337	currms = currms+w*AP.Math.Sqr(w/sr-1);
1338	currms = currms+(sr-w)*AP.Math.Sqr(w/sr);
1339	}
1340	currms = Math.Sqrt(currms/(nclasses*(idx2-idx1+1)));
1341	}
1342	else
1343	{
1344
1345	//
1346	// regression-specific code
1347	//
1348	sl = 0;
1349	sr = 0;
1350	v1 = 0;
1351	v2 = 0;
1352	for(j=0; j<=idx2-idx1; j++)
1353	{
1354	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1355	{
1356	v1 = v1+bufs.tmpbufr2[j];
1357	sl = sl+1;
1358	}
1359	else
1360	{
1361	v2 = v2+bufs.tmpbufr2[j];
1362	sr = sr+1;
1363	}
1364	}
1365	System.Diagnostics.Debug.Assert((double)(sl)!=(double)(0) & (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
1366	v1 = v1/sl;
1367	v2 = v2/sr;
1368	currms = 0;
1369	for(j=0; j<=idx2-idx1; j++)
1370	{
1371	if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
1372	{
1373	currms = currms+AP.Math.Sqr(v1-bufs.tmpbufr2[j]);
1374	}
1375	else
1376	{
1377	currms = currms+AP.Math.Sqr(v2-bufs.tmpbufr2[j]);
1378	}
1379	}
1380	currms = Math.Sqrt(currms/(idx2-idx1+1));
1381	}
1382	info = 1;
1383	}
1384	else
1385	{
1386
1387	//
1388	// Generic splits
1389	//
1390	if( nclasses>1 )
1391	{
1392	dfsplitc(ref bufs.tmpbufr, ref bufs.tmpbufi, ref bufs.classibuf, idx2-idx1+1, nclasses, dfusestrongsplits, ref info, ref threshold, ref currms);
1393	}
1394	else
1395	{
1396	dfsplitr(ref bufs.tmpbufr, ref bufs.tmpbufr2, idx2-idx1+1, dfusestrongsplits, ref info, ref threshold, ref currms);
1397	}
1398	}
1399	if( info>0 )
1400	{
1401	if( (double)(currms)<=(double)(ebest) )
1402	{
1403	ebest = currms;
1404	idxbest = varcur;
1405	tbest = threshold;
1406	}
1407	}
1408
1409	//
1410	// Next iteration
1411	//
1412	i = i+1;
1413	}
1414
1415	//
1416	// to split or not to split
1417	//
1418	if( idxbest<0 )
1419	{
1420
1421	//
1422	// All values are same, cannot split.
1423	//
1424	bufs.treebuf[numprocessed] = -1;
1425	if( nclasses>1 )
1426	{
1427
1428	//
1429	// Select random class label (randomness allows us to
1430	// approximate distribution of the classes)
1431	//
1432	bufs.treebuf[numprocessed+1] = (int)Math.Round(xy[bufs.idxbuf[idx1+AP.Math.RandomInteger(idx2-idx1+1)],nvars]);
1433	}
1434	else
1435	{
1436
1437	//
1438	// Select average (for regression task).
1439	//
1440	v = 0;
1441	for(i=idx1; i<=idx2; i++)
1442	{
1443	v = v+xy[bufs.idxbuf[i],nvars]/(idx2-idx1+1);
1444	}
1445	bufs.treebuf[numprocessed+1] = v;
1446	}
1447	numprocessed = numprocessed+leafnodewidth;
1448	}
1449	else
1450	{
1451
1452	//
1453	// we can split
1454	//
1455	bufs.treebuf[numprocessed] = idxbest;
1456	bufs.treebuf[numprocessed+1] = tbest;
1457	i1 = idx1;
1458	i2 = idx2;
1459	while( i1<=i2 )
1460	{
1461
1462	//
1463	// Reorder indices so that left partition is in [Idx1..I1-1],
1464	// and right partition is in [I2+1..Idx2]
1465	//
1466	if( (double)(xy[bufs.idxbuf[i1],idxbest])<(double)(tbest) )
1467	{
1468	i1 = i1+1;
1469	continue;
1470	}
1471	if( (double)(xy[bufs.idxbuf[i2],idxbest])>=(double)(tbest) )
1472	{
1473	i2 = i2-1;
1474	continue;
1475	}
1476	j = bufs.idxbuf[i1];
1477	bufs.idxbuf[i1] = bufs.idxbuf[i2];
1478	bufs.idxbuf[i2] = j;
1479	i1 = i1+1;
1480	i2 = i2-1;
1481	}
1482	oldnp = numprocessed;
1483	numprocessed = numprocessed+innernodewidth;
1484	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, idx1, i1-1, ref bufs);
1485	bufs.treebuf[oldnp+2] = numprocessed;
1486	dfbuildtreerec(ref xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, i2+1, idx2, ref bufs);
1487	}
1488	}
1489
1490
1491	/*************************************************************************
1492	Makes weak split on attribute
1493	*************************************************************************/
1494	private static void dfweakspliti(ref double[] x,
1495	ref int[] y,
1496	int n,
1497	int nclasses,
1498	ref int info,
1499	ref double threshold,
1500	ref double e)
1501	{
1502	int i = 0;
1503	int neq = 0;
1504	int nless = 0;
1505	int ngreater = 0;
1506
1507	tsort.tagsortfasti(ref x, ref y, n);
1508	if( n%2==1 )
1509	{
1510
1511	//
1512	// odd number of elements
1513	//
1514	threshold = x[n/2];
1515	}
1516	else
1517	{
1518
1519	//
1520	// even number of elements.
1521	//
1522	// if two closest to the middle of the array are equal,
1523	// we will select one of them (to avoid possible problems with
1524	// floating point errors).
1525	// we will select halfsum otherwise.
1526	//
1527	if( (double)(x[n/2-1])==(double)(x[n/2]) )
1528	{
1529	threshold = x[n/2-1];
1530	}
1531	else
1532	{
1533	threshold = 0.5*(x[n/2-1]+x[n/2]);
1534	}
1535	}
1536	neq = 0;
1537	nless = 0;
1538	ngreater = 0;
1539	for(i=0; i<=n-1; i++)
1540	{
1541	if( (double)(x[i])<(double)(threshold) )
1542	{
1543	nless = nless+1;
1544	}
1545	if( (double)(x[i])==(double)(threshold) )
1546	{
1547	neq = neq+1;
1548	}
1549	if( (double)(x[i])>(double)(threshold) )
1550	{
1551	ngreater = ngreater+1;
1552	}
1553	}
1554	if( nless==0 & ngreater==0 )
1555	{
1556	info = -3;
1557	}
1558	else
1559	{
1560	if( neq!=0 )
1561	{
1562	if( nless<ngreater )
1563	{
1564	threshold = 0.5*(x[nless+neq-1]+x[nless+neq]);
1565	}
1566	else
1567	{
1568	threshold = 0.5*(x[nless-1]+x[nless]);
1569	}
1570	}
1571	info = 1;
1572	e = 0;
1573	}
1574	}
1575
1576
1577	/*************************************************************************
1578	Makes split on attribute
1579	*************************************************************************/
1580	private static void dfsplitc(ref double[] x,
1581	ref int[] c,
1582	ref int[] cntbuf,
1583	int n,
1584	int nc,
1585	int flags,
1586	ref int info,
1587	ref double threshold,
1588	ref double e)
1589	{
1590	int i = 0;
1591	int neq = 0;
1592	int nless = 0;
1593	int ngreater = 0;
1594	int q = 0;
1595	int qmin = 0;
1596	int qmax = 0;
1597	int qcnt = 0;
1598	double cursplit = 0;
1599	int nleft = 0;
1600	double v = 0;
1601	double cure = 0;
1602	double w = 0;
1603	double sl = 0;
1604	double sr = 0;
1605
1606	tsort.tagsortfasti(ref x, ref c, n);
1607	e = AP.Math.MaxRealNumber;
1608	threshold = 0.5*(x[0]+x[n-1]);
1609	info = -3;
1610	if( flags/dfusestrongsplits%2==0 )
1611	{
1612
1613	//
1614	// weak splits, split at half
1615	//
1616	qcnt = 2;
1617	qmin = 1;
1618	qmax = 1;
1619	}
1620	else
1621	{
1622
1623	//
1624	// strong splits: choose best quartile
1625	//
1626	qcnt = 4;
1627	qmin = 1;
1628	qmax = 3;
1629	}
1630	for(q=qmin; q<=qmax; q++)
1631	{
1632	cursplit = x[n*q/qcnt];
1633	neq = 0;
1634	nless = 0;
1635	ngreater = 0;
1636	for(i=0; i<=n-1; i++)
1637	{
1638	if( (double)(x[i])<(double)(cursplit) )
1639	{
1640	nless = nless+1;
1641	}
1642	if( (double)(x[i])==(double)(cursplit) )
1643	{
1644	neq = neq+1;
1645	}
1646	if( (double)(x[i])>(double)(cursplit) )
1647	{
1648	ngreater = ngreater+1;
1649	}
1650	}
1651	System.Diagnostics.Debug.Assert(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1652	if( nless!=0 \| ngreater!=0 )
1653	{
1654
1655	//
1656	// set threshold between two partitions, with
1657	// some tweaking to avoid problems with floating point
1658	// arithmetics.
1659	//
1660	// The problem is that when you calculates C = 0.5*(A+B) there
1661	// can be no C which lies strictly between A and B (for example,
1662	// there is no floating point number which is
1663	// greater than 1 and less than 1+eps). In such situations
1664	// we choose right side as theshold (remember that
1665	// points which lie on threshold falls to the right side).
1666	//
1667	if( nless<ngreater )
1668	{
1669	cursplit = 0.5*(x[nless+neq-1]+x[nless+neq]);
1670	nleft = nless+neq;
1671	if( (double)(cursplit)<=(double)(x[nless+neq-1]) )
1672	{
1673	cursplit = x[nless+neq];
1674	}
1675	}
1676	else
1677	{
1678	cursplit = 0.5*(x[nless-1]+x[nless]);
1679	nleft = nless;
1680	if( (double)(cursplit)<=(double)(x[nless-1]) )
1681	{
1682	cursplit = x[nless];
1683	}
1684	}
1685	info = 1;
1686	cure = 0;
1687	for(i=0; i<=2*nc-1; i++)
1688	{
1689	cntbuf[i] = 0;
1690	}
1691	for(i=0; i<=nleft-1; i++)
1692	{
1693	cntbuf[c[i]] = cntbuf[c[i]]+1;
1694	}
1695	for(i=nleft; i<=n-1; i++)
1696	{
1697	cntbuf[nc+c[i]] = cntbuf[nc+c[i]]+1;
1698	}
1699	sl = nleft;
1700	sr = n-nleft;
1701	v = 0;
1702	for(i=0; i<=nc-1; i++)
1703	{
1704	w = cntbuf[i];
1705	v = v+w*AP.Math.Sqr(w/sl-1);
1706	v = v+(sl-w)*AP.Math.Sqr(w/sl);
1707	w = cntbuf[nc+i];
1708	v = v+w*AP.Math.Sqr(w/sr-1);
1709	v = v+(sr-w)*AP.Math.Sqr(w/sr);
1710	}
1711	cure = Math.Sqrt(v/(nc*n));
1712	if( (double)(cure)<(double)(e) )
1713	{
1714	threshold = cursplit;
1715	e = cure;
1716	}
1717	}
1718	}
1719	}
1720
1721
1722	/*************************************************************************
1723	Makes split on attribute
1724	*************************************************************************/
1725	private static void dfsplitr(ref double[] x,
1726	ref double[] y,
1727	int n,
1728	int flags,
1729	ref int info,
1730	ref double threshold,
1731	ref double e)
1732	{
1733	int i = 0;
1734	int neq = 0;
1735	int nless = 0;
1736	int ngreater = 0;
1737	int q = 0;
1738	int qmin = 0;
1739	int qmax = 0;
1740	int qcnt = 0;
1741	double cursplit = 0;
1742	int nleft = 0;
1743	double v = 0;
1744	double cure = 0;
1745
1746	tsort.tagsortfastr(ref x, ref y, n);
1747	e = AP.Math.MaxRealNumber;
1748	threshold = 0.5*(x[0]+x[n-1]);
1749	info = -3;
1750	if( flags/dfusestrongsplits%2==0 )
1751	{
1752
1753	//
1754	// weak splits, split at half
1755	//
1756	qcnt = 2;
1757	qmin = 1;
1758	qmax = 1;
1759	}
1760	else
1761	{
1762
1763	//
1764	// strong splits: choose best quartile
1765	//
1766	qcnt = 4;
1767	qmin = 1;
1768	qmax = 3;
1769	}
1770	for(q=qmin; q<=qmax; q++)
1771	{
1772	cursplit = x[n*q/qcnt];
1773	neq = 0;
1774	nless = 0;
1775	ngreater = 0;
1776	for(i=0; i<=n-1; i++)
1777	{
1778	if( (double)(x[i])<(double)(cursplit) )
1779	{
1780	nless = nless+1;
1781	}
1782	if( (double)(x[i])==(double)(cursplit) )
1783	{
1784	neq = neq+1;
1785	}
1786	if( (double)(x[i])>(double)(cursplit) )
1787	{
1788	ngreater = ngreater+1;
1789	}
1790	}
1791	System.Diagnostics.Debug.Assert(neq!=0, "DFSplitR: NEq=0, something strange!!!");
1792	if( nless!=0 \| ngreater!=0 )
1793	{
1794
1795	//
1796	// set threshold between two partitions, with
1797	// some tweaking to avoid problems with floating point
1798	// arithmetics.
1799	//
1800	// The problem is that when you calculates C = 0.5*(A+B) there
1801	// can be no C which lies strictly between A and B (for example,
1802	// there is no floating point number which is
1803	// greater than 1 and less than 1+eps). In such situations
1804	// we choose right side as theshold (remember that
1805	// points which lie on threshold falls to the right side).
1806	//
1807	if( nless<ngreater )
1808	{
1809	cursplit = 0.5*(x[nless+neq-1]+x[nless+neq]);
1810	nleft = nless+neq;
1811	if( (double)(cursplit)<=(double)(x[nless+neq-1]) )
1812	{
1813	cursplit = x[nless+neq];
1814	}
1815	}
1816	else
1817	{
1818	cursplit = 0.5*(x[nless-1]+x[nless]);
1819	nleft = nless;
1820	if( (double)(cursplit)<=(double)(x[nless-1]) )
1821	{
1822	cursplit = x[nless];
1823	}
1824	}
1825	info = 1;
1826	cure = 0;
1827	v = 0;
1828	for(i=0; i<=nleft-1; i++)
1829	{
1830	v = v+y[i];
1831	}
1832	v = v/nleft;
1833	for(i=0; i<=nleft-1; i++)
1834	{
1835	cure = cure+AP.Math.Sqr(y[i]-v);
1836	}
1837	v = 0;
1838	for(i=nleft; i<=n-1; i++)
1839	{
1840	v = v+y[i];
1841	}
1842	v = v/(n-nleft);
1843	for(i=nleft; i<=n-1; i++)
1844	{
1845	cure = cure+AP.Math.Sqr(y[i]-v);
1846	}
1847	cure = Math.Sqrt(cure/n);
1848	if( (double)(cure)<(double)(e) )
1849	{
1850	threshold = cursplit;
1851	e = cure;
1852	}
1853	}
1854	}
1855	}
1856	}
1857	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences