Context Navigation

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/algorithms/kdtree_cuda_builder.h @ 15840

Visit:

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago
#2886 added utility console program for clustering of expressions
File size: 29.0 KB

Line
1	/***********************************************************************
2	* Software License Agreement (BSD License)
3	*
4	* Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
5	*
6	* THE BSD LICENSE
7	*
8	* Redistribution and use in source and binary forms, with or without
9	* modification, are permitted provided that the following conditions
10	* are met:
11	*
12	* 1. Redistributions of source code must retain the above copyright
13	* notice, this list of conditions and the following disclaimer.
14	* 2. Redistributions in binary form must reproduce the above copyright
15	* notice, this list of conditions and the following disclaimer in the
16	* documentation and/or other materials provided with the distribution.
17	*
18	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28	*************************************************************************/
29
30	#ifndef FLANN_CUDA_KD_TREE_BUILDER_H_
31	#define FLANN_CUDA_KD_TREE_BUILDER_H_
32	#include <thrust/host_vector.h>
33	#include <thrust/device_vector.h>
34	#include <thrust/sort.h>
35	#include <thrust/partition.h>
36	#include <thrust/unique.h>
37	#include <thrust/scan.h>
38	#include <flann/util/cutil_math.h>
39	#include <stdlib.h>
40
41	// #define PRINT_DEBUG_TIMING
42
43	namespace flann
44	{
45	// template< typename T >
46	// void print_vector( const thrust::device_vector<T>& v )
47	// {
48	// for( int i=0; i< v.size(); i++ )
49	// {
50	// std::cout<<v[i]<<std::endl;
51	// }
52	// }
53	//
54	// template< typename T1, typename T2 >
55	// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2 )
56	// {
57	// for( int i=0; i< v1.size(); i++ )
58	// {
59	// std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<std::endl;
60	// }
61	// }
62	//
63	// template< typename T1, typename T2, typename T3 >
64	// void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2, const thrust::device_vector<T3>& v3 )
65	// {
66	// for( int i=0; i< v1.size(); i++ )
67	// {
68	// std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<" "<<v3[i]<<std::endl;
69	// }
70	// }
71	//
72	// template< typename T >
73	// void print_vector_by_index( const thrust::device_vector<T>& v,const thrust::device_vector<int>& ind )
74	// {
75	// for( int i=0; i< v.size(); i++ )
76	// {
77	// std::cout<<v[ind[i]]<<std::endl;
78	// }
79	// }
80
81	// std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s) {
82	// stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")";
83	// return stream;
84	// }
85	//
86	//
87	// std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::NodeInfo& s) {
88	// stream<<"(node: "<<s.child1()<<" "<<s.parent()<<" "<<s.child2()<<")";
89	// return stream;
90	// }
91	//
92	// std::ostream& operator <<(std::ostream& stream, const float4& s) {
93	// stream<<"("<<s.x<<","<<s.y<<","<<s.z<<","<<s.w<<")";
94	// return stream;
95	// }
96	namespace cuda
97	{
98	namespace kd_tree_builder_detail
99	{
100	//! normal node: contains the split dimension and value
101	//! leaf node: left == index of first points, right==index of last point +1
102	struct SplitInfo
103	{
104	union {
105	struct
106	{
107	// begin of child nodes
108	int left;
109	// end of child nodes
110	int right;
111	};
112	struct
113	{
114	int split_dim;
115	float split_val;
116	};
117	};
118
119	};
120
121	struct IsEven
122	{
123	typedef int result_type;
124	__device__
125	int operator()(int i )
126	{
127	return (i& 1)==0;
128	}
129	};
130
131	struct SecondElementIsEven
132	{
133	__host__ __device__
134	bool operator()( const thrust::tuple<int,int>& i )
135	{
136	return (thrust::get<1>(i)& 1)==0;
137	}
138	};
139
140	//! just for convenience: access a float4 by an index in [0,1,2]
141	//! (casting it to a float* and accessing it by the index is way slower...)
142	__host__ __device__
143	float get_value_by_index( const float4& f, int i )
144	{
145	switch(i) {
146	case 0:
147	return f.x;
148	case 1:
149	return f.y;
150	default:
151	return f.z;
152	}
153
154	}
155
156	//! mark a point as belonging to the left or right child of its current parent
157	//! called after parents are split
158	struct MovePointsToChildNodes
159	{
160	MovePointsToChildNodes( int* child1, SplitInfo* splits, float* x, float* y, float* z, int* ox, int* oy, int* oz, int* lrx, int* lry, int* lrz )
161	: child1_(child1), splits_(splits), x_(x), y_(y), z_(z), ox_(ox), oy_(oy), oz_(oz), lrx_(lrx), lry_(lry), lrz_(lrz){}
162	// int dim;
163	// float threshold;
164	int* child1_;
165	SplitInfo* splits_;
166
167	// coordinate values
168	float* x_, * y_, * z_;
169	// owner indices -> which node does the point belong to?
170	int* ox_, * oy_, * oz_;
171	// temp info: will be set to 1 of a point is moved to the right child node, 0 otherwise
172	// (used later in the scan op to separate the points of the children into continuous ranges)
173	int* lrx_, * lry_, * lrz_;
174	__device__
175	void operator()( const thrust::tuple<int, int, int, int>& data )
176	{
177	int index = thrust::get<0>(data);
178	int owner = ox_[index]; // before a split, all points at the same position in the index array have the same owner
179	int point_ind1=thrust::get<1>(data);
180	int point_ind2=thrust::get<2>(data);
181	int point_ind3=thrust::get<3>(data);
182	int leftChild=child1_[owner];
183	int split_dim;
184	float dim_val1, dim_val2, dim_val3;
185	SplitInfo split;
186	lrx_[index]=0;
187	lry_[index]=0;
188	lrz_[index]=0;
189	// this element already belongs to a leaf node -> everything alright, no need to change anything
190	if( leftChild==-1 ) {
191	return;
192	}
193	// otherwise: load split data, and assign this index to the new owner
194	split = splits_[owner];
195	split_dim=split.split_dim;
196	switch( split_dim ) {
197	case 0:
198	dim_val1=x_[point_ind1];
199	dim_val2=x_[point_ind2];
200	dim_val3=x_[point_ind3];
201	break;
202	case 1:
203	dim_val1=y_[point_ind1];
204	dim_val2=y_[point_ind2];
205	dim_val3=y_[point_ind3];
206	break;
207	default:
208	dim_val1=z_[point_ind1];
209	dim_val2=z_[point_ind2];
210	dim_val3=z_[point_ind3];
211	break;
212
213	}
214
215
216	int r1=leftChild +(dim_val1 > split.split_val);
217	ox_[index]=r1;
218	int r2=leftChild+(dim_val2 > split.split_val);
219	oy_[index]=r2;
220	oz_[index]=leftChild+(dim_val3 > split.split_val);
221
222	lrx_[index] = (dim_val1 > split.split_val);
223	lry_[index] = (dim_val2 > split.split_val);
224	lrz_[index] = (dim_val3 > split.split_val);
225	// return thrust::make_tuple( r1, r2, leftChild+(dim_val > split.split_val) );
226	}
227	};
228
229	//! used to update the left/right pointers and aabb infos after the node splits
230	struct SetLeftAndRightAndAABB
231	{
232	int maxPoints;
233	int nElements;
234
235	SplitInfo* nodes;
236	int* counts;
237	int* labels;
238	float4* aabbMin;
239	float4* aabbMax;
240	const float* x,* y,* z;
241	const int* ix, * iy, * iz;
242
243	__host__ __device__
244	void operator()( int i )
245	{
246	int index=labels[i];
247	int right;
248	int left = counts[i];
249	nodes[index].left=left;
250	if( i < nElements-1 ) {
251	right=counts[i+1];
252	}
253	else { // index==nNodes
254	right=maxPoints;
255	}
256	nodes[index].right=right;
257	aabbMin[index].x=x[ix[left]];
258	aabbMin[index].y=y[iy[left]];
259	aabbMin[index].z=z[iz[left]];
260	aabbMax[index].x=x[ix[right-1]];
261	aabbMax[index].y=y[iy[right-1]];
262	aabbMax[index].z=z[iz[right-1]];
263	}
264	};
265
266
267	//! - decide whether a node has to be split
268	//! if yes:
269	//! - allocate child nodes
270	//! - set split axis as axis of maximum aabb length
271	struct SplitNodes
272	{
273	int maxPointsPerNode;
274	int* node_count;
275	int* nodes_allocated;
276	int* out_of_space;
277	int* child1_;
278	int* parent_;
279	SplitInfo* splits;
280
281	__device__
282	void operator()( thrust::tuple<int&, int&,SplitInfo&,float4&,float4&, int> node ) // float4: aabbMin, aabbMax
283	{
284	int& parent=thrust::get<0>(node);
285	int& child1=thrust::get<1>(node);
286	SplitInfo& s=thrust::get<2>(node);
287	const float4& aabbMin=thrust::get<3>(node);
288	const float4& aabbMax=thrust::get<4>(node);
289	int my_index = thrust::get<5>(node);
290	bool split_node=false;
291	// first, each thread block counts the number of nodes that it needs to allocate...
292	__shared__ int block_nodes_to_allocate;
293	if( threadIdx.x== 0 ) block_nodes_to_allocate=0;
294	__syncthreads();
295
296	// don't split if all points are equal
297	// (could lead to an infinite loop, and doesn't make any sense anyway)
298	bool all_points_in_node_are_equal=aabbMin.x == aabbMax.x && aabbMin.y==aabbMax.y && aabbMin.z==aabbMax.z;
299
300	int offset_to_global=0;
301
302	// maybe this could be replaced with a reduction...
303	if(( child1==-1) &&( s.right-s.left > maxPointsPerNode) && !all_points_in_node_are_equal ) { // leaf node
304	split_node=true;
305	offset_to_global = atomicAdd( &block_nodes_to_allocate,2 );
306	}
307
308	__syncthreads();
309	__shared__ int block_left;
310	__shared__ bool enough_space;
311	// ... then the first thread tries to allocate this many nodes...
312	if( threadIdx.x==0) {
313	block_left = atomicAdd( node_count, block_nodes_to_allocate );
314	enough_space = block_left+block_nodes_to_allocate < *nodes_allocated;
315	// if it doesn't succeed, no nodes will be created by this block
316	if( !enough_space ) {
317	atomicAdd( node_count, -block_nodes_to_allocate );
318	*out_of_space=1;
319	}
320	}
321
322	__syncthreads();
323	// this thread needs to split it's node && there was enough space for all the nodes
324	// in this block.
325	//(The whole "allocate-per-block-thing" is much faster than letting each element allocate
326	// its space on its own, because shared memory atomics are A LOT faster than
327	// global mem atomics!)
328	if( split_node && enough_space ) {
329	int left = block_left + offset_to_global;
330
331	splits[left].left=s.left;
332	splits[left].right=s.right;
333	splits[left+1].left=0;
334	splits[left+1].right=0;
335
336	// split axis/position: middle of longest aabb extent
337	float4 aabbDim=aabbMax-aabbMin;
338	int maxDim=0;
339	float maxDimLength=aabbDim.x;
340	float4 splitVal=(aabbMax+aabbMin);
341	splitVal*=0.5f;
342	for( int i=1; i<=2; i++ ) {
343	float val = get_value_by_index(aabbDim,i);
344	if( val > maxDimLength ) {
345	maxDim=i;
346	maxDimLength=val;
347	}
348	}
349	s.split_dim=maxDim;
350	s.split_val=get_value_by_index(splitVal,maxDim);
351
352	child1_[my_index]=left;
353	splits[my_index]=s;
354
355	parent_[left]=my_index;
356	parent_[left+1]=my_index;
357	child1_[left]=-1;
358	child1_[left+1]=-1;
359	}
360	}
361	};
362
363
364	//! computes the scatter target address for the split operation, see Sengupta,Harris,Zhang,Owen: Scan Primitives for GPU Computing
365	//! in my use case, this is about 2x as fast as thrust::partition
366	struct set_addr3
367	{
368	const int* val_, * f_;
369
370	int npoints_;
371	__device__
372	int operator()( int id )
373	{
374	int nf = f_[npoints_-1] + (val_[npoints_-1]);
375	int f=f_[id];
376	int t = id -f+nf;
377	return val_[id] ? f : t;
378	}
379	};
380
381	//! converts a float4 point (xyz) to a tuple of three float vals (used to separate the
382	//! float4 input buffer into three arrays in the beginning of the tree build)
383	struct pointxyz_to_px_py_pz
384	{
385	__device__
386	thrust::tuple<float,float,float> operator()( const float4& val )
387	{
388	return thrust::make_tuple(val.x, val.y, val.z);
389	}
390	};
391	} // namespace kd_tree_builder_detail
392
393	} // namespace cuda
394
395
396	std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s)
397	{
398	stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")";
399	return stream;
400	}
401	class CudaKdTreeBuilder
402	{
403	public:
404	CudaKdTreeBuilder( const thrust::device_vector<float4>& points, int max_leaf_size ) : /out_of_space_(1,0),node_count_(1,1),/ max_leaf_size_(max_leaf_size)
405	{
406	points_=&points;
407	int prealloc = points.size()/max_leaf_size_*16;
408	allocation_info_.resize(3);
409	allocation_info_[NodeCount]=1;
410	allocation_info_[NodesAllocated]=prealloc;
411	allocation_info_[OutOfSpace]=0;
412
413	// std::cout<<points_->size()<<std::endl;
414
415	child1_=new thrust::device_vector<int>(prealloc,-1);
416	parent_=new thrust::device_vector<int>(prealloc,-1);
417	cuda::kd_tree_builder_detail::SplitInfo s;
418	s.left=0;
419	s.right=0;
420	splits_=new thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>(prealloc,s);
421	s.right=points.size();
422	(*splits_)[0]=s;
423
424	aabb_min_=new thrust::device_vector<float4>(prealloc);
425	aabb_max_=new thrust::device_vector<float4>(prealloc);
426
427	index_x_=new thrust::device_vector<int>(points_->size());
428	index_y_=new thrust::device_vector<int>(points_->size());
429	index_z_=new thrust::device_vector<int>(points_->size());
430
431	owners_x_=new thrust::device_vector<int>(points_->size(),0);
432	owners_y_=new thrust::device_vector<int>(points_->size(),0);
433	owners_z_=new thrust::device_vector<int>(points_->size(),0);
434
435	leftright_x_ = new thrust::device_vector<int>(points_->size(),0);
436	leftright_y_ = new thrust::device_vector<int>(points_->size(),0);
437	leftright_z_ = new thrust::device_vector<int>(points_->size(),0);
438
439	tmp_index_=new thrust::device_vector<int>(points_->size());
440	tmp_owners_=new thrust::device_vector<int>(points_->size());
441	tmp_misc_=new thrust::device_vector<int>(points_->size());
442
443	points_x_=new thrust::device_vector<float>(points_->size());
444	points_y_=new thrust::device_vector<float>(points_->size());
445	points_z_=new thrust::device_vector<float>(points_->size());
446	delete_node_info_=false;
447	}
448
449	~CudaKdTreeBuilder()
450	{
451	if( delete_node_info_ ) {
452	delete child1_;
453	delete parent_;
454	delete splits_;
455	delete aabb_min_;
456	delete aabb_max_;
457	delete index_x_;
458	}
459
460	delete index_y_;
461	delete index_z_;
462	delete owners_x_;
463	delete owners_y_;
464	delete owners_z_;
465	delete points_x_;
466	delete points_y_;
467	delete points_z_;
468	delete leftright_x_;
469	delete leftright_y_;
470	delete leftright_z_;
471	delete tmp_index_;
472	delete tmp_owners_;
473	delete tmp_misc_;
474	}
475
476	//! build the tree
477	//! general idea:
478	//! - build sorted lists of the points in x y and z order (to be able to compute tight AABBs in O(1) )
479	//! - while( nodes to split exist )
480	//! - split non-child nodes along longest axis if the number of points is > max_points_per_node
481	//! - for each point: determine whether it is in a node that was split. If yes, mark it as belonging to the left or right child node of its current parent node
482	//! - reorder the points so that the points of a single node are continuous in the node array
483	//! - update the left/right pointers and AABBs of all nodes
484	void buildTree()
485	{
486	// std::cout<<"buildTree()"<<std::endl;
487	// sleep(1);
488	// Util::Timer stepTimer;
489	thrust::transform( points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() );
490
491	thrust::counting_iterator<int> it(0);
492	thrust::copy( it, it+points_->size(), index_x_->begin() );
493
494	thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() );
495	thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() );
496
497	thrust::device_vector<float> tmpv(points_->size());
498
499	// create sorted index list -> can be used to compute AABBs in O(1)
500	thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin());
501	thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() );
502	thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin());
503	thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() );
504	thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin());
505	thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() );
506
507
508	(aabb_min_)[0]=make_float4((points_x_)[(index_x_)[0]],(points_y_)[(index_y_)[0]],(points_z_)[(*index_z_)[0]],0);
509
510	(aabb_max_)[0]=make_float4((points_x_)[(index_x_)[points_->size()-1]],(points_y_)[(index_y_)[points_->size()-1]],(points_z_)[(*index_z_)[points_->size()-1]],0);
511	#ifdef PRINT_DEBUG_TIMING
512	cudaDeviceSynchronize();
513	std::cout<<" initial stuff:"<<stepTimer.elapsed()<<std::endl;
514	stepTimer.restart();
515	#endif
516	int last_node_count=0;
517	for( int i=0;; i++ ) {
518	cuda::kd_tree_builder_detail::SplitNodes sn;
519
520	sn.maxPointsPerNode=max_leaf_size_;
521	sn.node_count=thrust::raw_pointer_cast(&allocation_info_[NodeCount]);
522	sn.nodes_allocated=thrust::raw_pointer_cast(&allocation_info_[NodesAllocated]);
523	sn.out_of_space=thrust::raw_pointer_cast(&allocation_info_[OutOfSpace]);
524	sn.child1_=thrust::raw_pointer_cast(&(*child1_)[0]);
525	sn.parent_=thrust::raw_pointer_cast(&(*parent_)[0]);
526	sn.splits=thrust::raw_pointer_cast(&(*splits_)[0]);
527
528	thrust::counting_iterator<int> cit(0);
529	thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )),
530	thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )),
531	sn );
532	// copy allocation info to host
533	thrust::host_vector<int> alloc_info = allocation_info_;
534
535	if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done
536	break;
537	}
538	last_node_count=alloc_info[NodeCount];
539
540	// a node was un-splittable due to a lack of space
541	if( alloc_info[OutOfSpace]==1 ) {
542	resize_node_vectors(alloc_info[NodesAllocated]*2);
543	alloc_info[OutOfSpace]=0;
544	alloc_info[NodesAllocated]*=2;
545	allocation_info_=alloc_info;
546	}
547	#ifdef PRINT_DEBUG_TIMING
548	cudaDeviceSynchronize();
549	std::cout<<" node split:"<<stepTimer.elapsed()<<std::endl;
550	stepTimer.restart();
551	#endif
552
553	// foreach point: point was in node that was split?move it to child (leaf) node : do nothing
554	cuda::kd_tree_builder_detail::MovePointsToChildNodes sno( thrust::raw_pointer_cast(&(*child1_)[0]),
555	thrust::raw_pointer_cast(&(*splits_)[0]),
556	thrust::raw_pointer_cast(&(*points_x_)[0]),
557	thrust::raw_pointer_cast(&(*points_y_)[0]),
558	thrust::raw_pointer_cast(&(*points_z_)[0]),
559	thrust::raw_pointer_cast(&(*owners_x_)[0]),
560	thrust::raw_pointer_cast(&(*owners_y_)[0]),
561	thrust::raw_pointer_cast(&(*owners_z_)[0]),
562	thrust::raw_pointer_cast(&(*leftright_x_)[0]),
563	thrust::raw_pointer_cast(&(*leftright_y_)[0]),
564	thrust::raw_pointer_cast(&(*leftright_z_)[0])
565	);
566	thrust::counting_iterator<int> ci0(0);
567	thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ),
568	thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno );
569
570	#ifdef PRINT_DEBUG_TIMING
571	cudaDeviceSynchronize();
572	std::cout<<" set new owners:"<<stepTimer.elapsed()<<std::endl;
573	stepTimer.restart();
574	#endif
575
576	// move points around so that each leaf node's points are continuous
577	separate_left_and_right_children(index_x_,owners_x_,tmp_index_,tmp_owners_, *leftright_x_);
578	std::swap(tmp_index_, index_x_);
579	std::swap(tmp_owners_, owners_x_);
580	separate_left_and_right_children(index_y_,owners_y_,tmp_index_,tmp_owners_, *leftright_y_,false);
581	std::swap(tmp_index_, index_y_);
582	separate_left_and_right_children(index_z_,owners_z_,tmp_index_,tmp_owners_, *leftright_z_,false);
583	std::swap(tmp_index_, index_z_);
584
585	#ifdef PRINT_DEBUG_TIMING
586	cudaDeviceSynchronize();
587	std::cout<<" split:"<<stepTimer.elapsed()<<std::endl;
588	stepTimer.restart();
589	#endif
590	// calculate new AABB etc
591	update_leftright_and_aabb( points_x_, points_y_, points_z_, index_x_, index_y_, index_z_, owners_x_, splits_,aabb_min_, aabb_max_);
592	#ifdef PRINT_DEBUG_TIMING
593	cudaDeviceSynchronize();
594	std::cout<<" update_leftright_and_aabb:"<<stepTimer.elapsed()<<std::endl;
595	stepTimer.restart();
596	print_vector(node_count_);
597	#endif
598
599	}
600	}
601
602	template<class Distance>
603	friend class KDTreeCuda3dIndex;
604
605	protected:
606
607
608	//! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs
609	void
610	update_leftright_and_aabb( const thrust::device_vector<float>& x, const thrust::device_vector<float>& y,const thrust::device_vector<float>& z,
611	const thrust::device_vector<int>& ix, const thrust::device_vector<int>& iy,const thrust::device_vector<int>& iz,
612	const thrust::device_vector<int>& owners,
613	thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>& splits, thrust::device_vector<float4>& aabbMin,thrust::device_vector<float4>& aabbMax)
614	{
615	thrust::device_vector<int>* labelsUnique=tmp_owners_;
616	thrust::device_vector<int>* countsUnique=tmp_index_;
617	// assume: points of each node are continuous in the array
618
619	// find which nodes are here, and where each node's points begin and end
620	int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator<int>(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin();
621
622	// update the info
623	cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s;
624	s.maxPoints=x.size();
625	s.nElements=unique_labels;
626	s.nodes=thrust::raw_pointer_cast(&(splits[0]));
627	s.counts=thrust::raw_pointer_cast(&( (*countsUnique)[0]));
628	s.labels=thrust::raw_pointer_cast(&( (*labelsUnique)[0]));
629	s.x=thrust::raw_pointer_cast(&x[0]);
630	s.y=thrust::raw_pointer_cast(&y[0]);
631	s.z=thrust::raw_pointer_cast(&z[0]);
632	s.ix=thrust::raw_pointer_cast(&ix[0]);
633	s.iy=thrust::raw_pointer_cast(&iy[0]);
634	s.iz=thrust::raw_pointer_cast(&iz[0]);
635	s.aabbMin=thrust::raw_pointer_cast(&aabbMin[0]);
636	s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]);
637
638	thrust::counting_iterator<int> it(0);
639	thrust::for_each(it, it+unique_labels, s);
640	}
641
642	//! Separates the left and right children of each node into continuous parts of the array.
643	//! More specifically, it seperates children with even and odd node indices because nodes are always
644	//! allocated in pairs -> child1==child2+1 -> child1 even and child2 odd, or vice-versa.
645	//! Since the split operation is stable, this results in continuous partitions
646	//! for all the single nodes.
647	//! (basically the split primitive according to sengupta et al)
648	//! about twice as fast as thrust::partition
649	void separate_left_and_right_children( thrust::device_vector<int>& key_in, thrust::device_vector<int>& val_in, thrust::device_vector<int>& key_out, thrust::device_vector<int>& val_out, thrust::device_vector<int>& left_right_marks, bool scatter_val_out=true )
650	{
651	thrust::device_vector<int>* f_tmp = &val_out;
652	thrust::device_vector<int>* addr_tmp = tmp_misc_;
653
654	thrust::exclusive_scan( /thrust::make_transform_iterator(/ left_right_marks.begin() /,cuda::kd_tree_builder_detail::IsEven/
655	/())/, /thrust::make_transform_iterator(/ left_right_marks.end() /,cuda::kd_tree_builder_detail::IsEven/
656	/())/, f_tmp->begin() );
657	cuda::kd_tree_builder_detail::set_addr3 sa;
658	sa.val_=thrust::raw_pointer_cast(&left_right_marks[0]);
659	sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]);
660	sa.npoints_=key_in.size();
661	thrust::counting_iterator<int> it(0);
662	thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa);
663
664	thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin());
665	if( scatter_val_out ) thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin());
666	}
667
668	//! allocates additional space in all the node-related vectors.
669	//! new_size elements will be added to all vectors.
670	void resize_node_vectors( size_t new_size )
671	{
672	size_t add = new_size - child1_->size();
673	child1_->insert(child1_->end(), add, -1);
674	parent_->insert(parent_->end(), add, -1);
675	cuda::kd_tree_builder_detail::SplitInfo s;
676	s.left=0;
677	s.right=0;
678	splits_->insert(splits_->end(), add, s);
679	float4 f;
680	aabb_min_->insert(aabb_min_->end(), add, f);
681	aabb_max_->insert(aabb_max_->end(), add, f);
682	}
683
684
685	const thrust::device_vector<float4>* points_;
686
687	// tree data, those are stored per-node
688
689	//! left child of each node. (right child==left child + 1, due to the alloc mechanism)
690	//! child1_[node]==-1 if node is a leaf node
691	thrust::device_vector<int>* child1_;
692	//! parent node of each node
693	thrust::device_vector<int>* parent_;
694	//! split info (dim/value or left/right pointers)
695	thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>* splits_;
696	//! min aabb value of each node
697	thrust::device_vector<float4>* aabb_min_;
698	//! max aabb value of each node
699	thrust::device_vector<float4>* aabb_max_;
700
701	enum AllocationInfo
702	{
703	NodeCount=0,
704	NodesAllocated=1,
705	OutOfSpace=2
706	};
707	// those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them
708	// thrust::device_vector<int> out_of_space_;
709	// thrust::device_vector<int> node_count_;
710	// thrust::device_vector<int> nodes_allocated_;
711	thrust::device_vector<int> allocation_info_;
712
713	int max_leaf_size_;
714
715	// coordinate values of the points
716	thrust::device_vector<float>* points_x_, * points_y_, * points_z_;
717	// indices
718	thrust::device_vector<int>* index_x_, * index_y_, * index_z_;
719	// owner node
720	thrust::device_vector<int>* owners_x_, * owners_y_, * owners_z_;
721	// contains info about whether a point was partitioned to the left or right child after a split
722	thrust::device_vector<int>* leftright_x_, * leftright_y_, * leftright_z_;
723	thrust::device_vector<int>* tmp_index_, * tmp_owners_, * tmp_misc_;
724	bool delete_node_info_;
725	};
726
727
728	} // namespace flann
729	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences