1 | /*********************************************************************** |
---|
2 | * Software License Agreement (BSD License) |
---|
3 | * |
---|
4 | * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved. |
---|
5 | * |
---|
6 | * THE BSD LICENSE |
---|
7 | * |
---|
8 | * Redistribution and use in source and binary forms, with or without |
---|
9 | * modification, are permitted provided that the following conditions |
---|
10 | * are met: |
---|
11 | * |
---|
12 | * 1. Redistributions of source code must retain the above copyright |
---|
13 | * notice, this list of conditions and the following disclaimer. |
---|
14 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
15 | * notice, this list of conditions and the following disclaimer in the |
---|
16 | * documentation and/or other materials provided with the distribution. |
---|
17 | * |
---|
18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
---|
19 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
---|
20 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
21 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
---|
22 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
---|
23 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
---|
24 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
---|
25 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
---|
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
---|
27 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
28 | *************************************************************************/ |
---|
29 | |
---|
30 | #ifndef FLANN_CUDA_KD_TREE_BUILDER_H_ |
---|
31 | #define FLANN_CUDA_KD_TREE_BUILDER_H_ |
---|
32 | #include <thrust/host_vector.h> |
---|
33 | #include <thrust/device_vector.h> |
---|
34 | #include <thrust/sort.h> |
---|
35 | #include <thrust/partition.h> |
---|
36 | #include <thrust/unique.h> |
---|
37 | #include <thrust/scan.h> |
---|
38 | #include <flann/util/cutil_math.h> |
---|
39 | #include <stdlib.h> |
---|
40 | |
---|
41 | // #define PRINT_DEBUG_TIMING |
---|
42 | |
---|
43 | namespace flann |
---|
44 | { |
---|
45 | // template< typename T > |
---|
46 | // void print_vector( const thrust::device_vector<T>& v ) |
---|
47 | // { |
---|
48 | // for( int i=0; i< v.size(); i++ ) |
---|
49 | // { |
---|
50 | // std::cout<<v[i]<<std::endl; |
---|
51 | // } |
---|
52 | // } |
---|
53 | // |
---|
54 | // template< typename T1, typename T2 > |
---|
55 | // void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2 ) |
---|
56 | // { |
---|
57 | // for( int i=0; i< v1.size(); i++ ) |
---|
58 | // { |
---|
59 | // std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<std::endl; |
---|
60 | // } |
---|
61 | // } |
---|
62 | // |
---|
63 | // template< typename T1, typename T2, typename T3 > |
---|
64 | // void print_vector( const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2, const thrust::device_vector<T3>& v3 ) |
---|
65 | // { |
---|
66 | // for( int i=0; i< v1.size(); i++ ) |
---|
67 | // { |
---|
68 | // std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<" "<<v3[i]<<std::endl; |
---|
69 | // } |
---|
70 | // } |
---|
71 | // |
---|
72 | // template< typename T > |
---|
73 | // void print_vector_by_index( const thrust::device_vector<T>& v,const thrust::device_vector<int>& ind ) |
---|
74 | // { |
---|
75 | // for( int i=0; i< v.size(); i++ ) |
---|
76 | // { |
---|
77 | // std::cout<<v[ind[i]]<<std::endl; |
---|
78 | // } |
---|
79 | // } |
---|
80 | |
---|
81 | // std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s) { |
---|
82 | // stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")"; |
---|
83 | // return stream; |
---|
84 | // } |
---|
85 | // |
---|
86 | // |
---|
87 | // std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::NodeInfo& s) { |
---|
88 | // stream<<"(node: "<<s.child1()<<" "<<s.parent()<<" "<<s.child2()<<")"; |
---|
89 | // return stream; |
---|
90 | // } |
---|
91 | // |
---|
92 | // std::ostream& operator <<(std::ostream& stream, const float4& s) { |
---|
93 | // stream<<"("<<s.x<<","<<s.y<<","<<s.z<<","<<s.w<<")"; |
---|
94 | // return stream; |
---|
95 | // } |
---|
96 | namespace cuda |
---|
97 | { |
---|
98 | namespace kd_tree_builder_detail |
---|
99 | { |
---|
100 | //! normal node: contains the split dimension and value |
---|
101 | //! leaf node: left == index of first points, right==index of last point +1 |
---|
102 | struct SplitInfo |
---|
103 | { |
---|
104 | union { |
---|
105 | struct |
---|
106 | { |
---|
107 | // begin of child nodes |
---|
108 | int left; |
---|
109 | // end of child nodes |
---|
110 | int right; |
---|
111 | }; |
---|
112 | struct |
---|
113 | { |
---|
114 | int split_dim; |
---|
115 | float split_val; |
---|
116 | }; |
---|
117 | }; |
---|
118 | |
---|
119 | }; |
---|
120 | |
---|
121 | struct IsEven |
---|
122 | { |
---|
123 | typedef int result_type; |
---|
124 | __device__ |
---|
125 | int operator()(int i ) |
---|
126 | { |
---|
127 | return (i& 1)==0; |
---|
128 | } |
---|
129 | }; |
---|
130 | |
---|
131 | struct SecondElementIsEven |
---|
132 | { |
---|
133 | __host__ __device__ |
---|
134 | bool operator()( const thrust::tuple<int,int>& i ) |
---|
135 | { |
---|
136 | return (thrust::get<1>(i)& 1)==0; |
---|
137 | } |
---|
138 | }; |
---|
139 | |
---|
140 | //! just for convenience: access a float4 by an index in [0,1,2] |
---|
141 | //! (casting it to a float* and accessing it by the index is way slower...) |
---|
142 | __host__ __device__ |
---|
143 | float get_value_by_index( const float4& f, int i ) |
---|
144 | { |
---|
145 | switch(i) { |
---|
146 | case 0: |
---|
147 | return f.x; |
---|
148 | case 1: |
---|
149 | return f.y; |
---|
150 | default: |
---|
151 | return f.z; |
---|
152 | } |
---|
153 | |
---|
154 | } |
---|
155 | |
---|
156 | //! mark a point as belonging to the left or right child of its current parent |
---|
157 | //! called after parents are split |
---|
158 | struct MovePointsToChildNodes |
---|
159 | { |
---|
160 | MovePointsToChildNodes( int* child1, SplitInfo* splits, float* x, float* y, float* z, int* ox, int* oy, int* oz, int* lrx, int* lry, int* lrz ) |
---|
161 | : child1_(child1), splits_(splits), x_(x), y_(y), z_(z), ox_(ox), oy_(oy), oz_(oz), lrx_(lrx), lry_(lry), lrz_(lrz){} |
---|
162 | // int dim; |
---|
163 | // float threshold; |
---|
164 | int* child1_; |
---|
165 | SplitInfo* splits_; |
---|
166 | |
---|
167 | // coordinate values |
---|
168 | float* x_, * y_, * z_; |
---|
169 | // owner indices -> which node does the point belong to? |
---|
170 | int* ox_, * oy_, * oz_; |
---|
171 | // temp info: will be set to 1 of a point is moved to the right child node, 0 otherwise |
---|
172 | // (used later in the scan op to separate the points of the children into continuous ranges) |
---|
173 | int* lrx_, * lry_, * lrz_; |
---|
174 | __device__ |
---|
175 | void operator()( const thrust::tuple<int, int, int, int>& data ) |
---|
176 | { |
---|
177 | int index = thrust::get<0>(data); |
---|
178 | int owner = ox_[index]; // before a split, all points at the same position in the index array have the same owner |
---|
179 | int point_ind1=thrust::get<1>(data); |
---|
180 | int point_ind2=thrust::get<2>(data); |
---|
181 | int point_ind3=thrust::get<3>(data); |
---|
182 | int leftChild=child1_[owner]; |
---|
183 | int split_dim; |
---|
184 | float dim_val1, dim_val2, dim_val3; |
---|
185 | SplitInfo split; |
---|
186 | lrx_[index]=0; |
---|
187 | lry_[index]=0; |
---|
188 | lrz_[index]=0; |
---|
189 | // this element already belongs to a leaf node -> everything alright, no need to change anything |
---|
190 | if( leftChild==-1 ) { |
---|
191 | return; |
---|
192 | } |
---|
193 | // otherwise: load split data, and assign this index to the new owner |
---|
194 | split = splits_[owner]; |
---|
195 | split_dim=split.split_dim; |
---|
196 | switch( split_dim ) { |
---|
197 | case 0: |
---|
198 | dim_val1=x_[point_ind1]; |
---|
199 | dim_val2=x_[point_ind2]; |
---|
200 | dim_val3=x_[point_ind3]; |
---|
201 | break; |
---|
202 | case 1: |
---|
203 | dim_val1=y_[point_ind1]; |
---|
204 | dim_val2=y_[point_ind2]; |
---|
205 | dim_val3=y_[point_ind3]; |
---|
206 | break; |
---|
207 | default: |
---|
208 | dim_val1=z_[point_ind1]; |
---|
209 | dim_val2=z_[point_ind2]; |
---|
210 | dim_val3=z_[point_ind3]; |
---|
211 | break; |
---|
212 | |
---|
213 | } |
---|
214 | |
---|
215 | |
---|
216 | int r1=leftChild +(dim_val1 > split.split_val); |
---|
217 | ox_[index]=r1; |
---|
218 | int r2=leftChild+(dim_val2 > split.split_val); |
---|
219 | oy_[index]=r2; |
---|
220 | oz_[index]=leftChild+(dim_val3 > split.split_val); |
---|
221 | |
---|
222 | lrx_[index] = (dim_val1 > split.split_val); |
---|
223 | lry_[index] = (dim_val2 > split.split_val); |
---|
224 | lrz_[index] = (dim_val3 > split.split_val); |
---|
225 | // return thrust::make_tuple( r1, r2, leftChild+(dim_val > split.split_val) ); |
---|
226 | } |
---|
227 | }; |
---|
228 | |
---|
229 | //! used to update the left/right pointers and aabb infos after the node splits |
---|
230 | struct SetLeftAndRightAndAABB |
---|
231 | { |
---|
232 | int maxPoints; |
---|
233 | int nElements; |
---|
234 | |
---|
235 | SplitInfo* nodes; |
---|
236 | int* counts; |
---|
237 | int* labels; |
---|
238 | float4* aabbMin; |
---|
239 | float4* aabbMax; |
---|
240 | const float* x,* y,* z; |
---|
241 | const int* ix, * iy, * iz; |
---|
242 | |
---|
243 | __host__ __device__ |
---|
244 | void operator()( int i ) |
---|
245 | { |
---|
246 | int index=labels[i]; |
---|
247 | int right; |
---|
248 | int left = counts[i]; |
---|
249 | nodes[index].left=left; |
---|
250 | if( i < nElements-1 ) { |
---|
251 | right=counts[i+1]; |
---|
252 | } |
---|
253 | else { // index==nNodes |
---|
254 | right=maxPoints; |
---|
255 | } |
---|
256 | nodes[index].right=right; |
---|
257 | aabbMin[index].x=x[ix[left]]; |
---|
258 | aabbMin[index].y=y[iy[left]]; |
---|
259 | aabbMin[index].z=z[iz[left]]; |
---|
260 | aabbMax[index].x=x[ix[right-1]]; |
---|
261 | aabbMax[index].y=y[iy[right-1]]; |
---|
262 | aabbMax[index].z=z[iz[right-1]]; |
---|
263 | } |
---|
264 | }; |
---|
265 | |
---|
266 | |
---|
267 | //! - decide whether a node has to be split |
---|
268 | //! if yes: |
---|
269 | //! - allocate child nodes |
---|
270 | //! - set split axis as axis of maximum aabb length |
---|
271 | struct SplitNodes |
---|
272 | { |
---|
273 | int maxPointsPerNode; |
---|
274 | int* node_count; |
---|
275 | int* nodes_allocated; |
---|
276 | int* out_of_space; |
---|
277 | int* child1_; |
---|
278 | int* parent_; |
---|
279 | SplitInfo* splits; |
---|
280 | |
---|
281 | __device__ |
---|
282 | void operator()( thrust::tuple<int&, int&,SplitInfo&,float4&,float4&, int> node ) // float4: aabbMin, aabbMax |
---|
283 | { |
---|
284 | int& parent=thrust::get<0>(node); |
---|
285 | int& child1=thrust::get<1>(node); |
---|
286 | SplitInfo& s=thrust::get<2>(node); |
---|
287 | const float4& aabbMin=thrust::get<3>(node); |
---|
288 | const float4& aabbMax=thrust::get<4>(node); |
---|
289 | int my_index = thrust::get<5>(node); |
---|
290 | bool split_node=false; |
---|
291 | // first, each thread block counts the number of nodes that it needs to allocate... |
---|
292 | __shared__ int block_nodes_to_allocate; |
---|
293 | if( threadIdx.x== 0 ) block_nodes_to_allocate=0; |
---|
294 | __syncthreads(); |
---|
295 | |
---|
296 | // don't split if all points are equal |
---|
297 | // (could lead to an infinite loop, and doesn't make any sense anyway) |
---|
298 | bool all_points_in_node_are_equal=aabbMin.x == aabbMax.x && aabbMin.y==aabbMax.y && aabbMin.z==aabbMax.z; |
---|
299 | |
---|
300 | int offset_to_global=0; |
---|
301 | |
---|
302 | // maybe this could be replaced with a reduction... |
---|
303 | if(( child1==-1) &&( s.right-s.left > maxPointsPerNode) && !all_points_in_node_are_equal ) { // leaf node |
---|
304 | split_node=true; |
---|
305 | offset_to_global = atomicAdd( &block_nodes_to_allocate,2 ); |
---|
306 | } |
---|
307 | |
---|
308 | __syncthreads(); |
---|
309 | __shared__ int block_left; |
---|
310 | __shared__ bool enough_space; |
---|
311 | // ... then the first thread tries to allocate this many nodes... |
---|
312 | if( threadIdx.x==0) { |
---|
313 | block_left = atomicAdd( node_count, block_nodes_to_allocate ); |
---|
314 | enough_space = block_left+block_nodes_to_allocate < *nodes_allocated; |
---|
315 | // if it doesn't succeed, no nodes will be created by this block |
---|
316 | if( !enough_space ) { |
---|
317 | atomicAdd( node_count, -block_nodes_to_allocate ); |
---|
318 | *out_of_space=1; |
---|
319 | } |
---|
320 | } |
---|
321 | |
---|
322 | __syncthreads(); |
---|
323 | // this thread needs to split it's node && there was enough space for all the nodes |
---|
324 | // in this block. |
---|
325 | //(The whole "allocate-per-block-thing" is much faster than letting each element allocate |
---|
326 | // its space on its own, because shared memory atomics are A LOT faster than |
---|
327 | // global mem atomics!) |
---|
328 | if( split_node && enough_space ) { |
---|
329 | int left = block_left + offset_to_global; |
---|
330 | |
---|
331 | splits[left].left=s.left; |
---|
332 | splits[left].right=s.right; |
---|
333 | splits[left+1].left=0; |
---|
334 | splits[left+1].right=0; |
---|
335 | |
---|
336 | // split axis/position: middle of longest aabb extent |
---|
337 | float4 aabbDim=aabbMax-aabbMin; |
---|
338 | int maxDim=0; |
---|
339 | float maxDimLength=aabbDim.x; |
---|
340 | float4 splitVal=(aabbMax+aabbMin); |
---|
341 | splitVal*=0.5f; |
---|
342 | for( int i=1; i<=2; i++ ) { |
---|
343 | float val = get_value_by_index(aabbDim,i); |
---|
344 | if( val > maxDimLength ) { |
---|
345 | maxDim=i; |
---|
346 | maxDimLength=val; |
---|
347 | } |
---|
348 | } |
---|
349 | s.split_dim=maxDim; |
---|
350 | s.split_val=get_value_by_index(splitVal,maxDim); |
---|
351 | |
---|
352 | child1_[my_index]=left; |
---|
353 | splits[my_index]=s; |
---|
354 | |
---|
355 | parent_[left]=my_index; |
---|
356 | parent_[left+1]=my_index; |
---|
357 | child1_[left]=-1; |
---|
358 | child1_[left+1]=-1; |
---|
359 | } |
---|
360 | } |
---|
361 | }; |
---|
362 | |
---|
363 | |
---|
364 | //! computes the scatter target address for the split operation, see Sengupta,Harris,Zhang,Owen: Scan Primitives for GPU Computing |
---|
365 | //! in my use case, this is about 2x as fast as thrust::partition |
---|
366 | struct set_addr3 |
---|
367 | { |
---|
368 | const int* val_, * f_; |
---|
369 | |
---|
370 | int npoints_; |
---|
371 | __device__ |
---|
372 | int operator()( int id ) |
---|
373 | { |
---|
374 | int nf = f_[npoints_-1] + (val_[npoints_-1]); |
---|
375 | int f=f_[id]; |
---|
376 | int t = id -f+nf; |
---|
377 | return val_[id] ? f : t; |
---|
378 | } |
---|
379 | }; |
---|
380 | |
---|
381 | //! converts a float4 point (xyz) to a tuple of three float vals (used to separate the |
---|
382 | //! float4 input buffer into three arrays in the beginning of the tree build) |
---|
383 | struct pointxyz_to_px_py_pz |
---|
384 | { |
---|
385 | __device__ |
---|
386 | thrust::tuple<float,float,float> operator()( const float4& val ) |
---|
387 | { |
---|
388 | return thrust::make_tuple(val.x, val.y, val.z); |
---|
389 | } |
---|
390 | }; |
---|
391 | } // namespace kd_tree_builder_detail |
---|
392 | |
---|
393 | } // namespace cuda |
---|
394 | |
---|
395 | |
---|
396 | std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s) |
---|
397 | { |
---|
398 | stream<<"(split l/r: "<< s.left <<" "<< s.right<< " split:"<<s.split_dim<<" "<<s.split_val<<")"; |
---|
399 | return stream; |
---|
400 | } |
---|
401 | class CudaKdTreeBuilder |
---|
402 | { |
---|
403 | public: |
---|
404 | CudaKdTreeBuilder( const thrust::device_vector<float4>& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size) |
---|
405 | { |
---|
406 | points_=&points; |
---|
407 | int prealloc = points.size()/max_leaf_size_*16; |
---|
408 | allocation_info_.resize(3); |
---|
409 | allocation_info_[NodeCount]=1; |
---|
410 | allocation_info_[NodesAllocated]=prealloc; |
---|
411 | allocation_info_[OutOfSpace]=0; |
---|
412 | |
---|
413 | // std::cout<<points_->size()<<std::endl; |
---|
414 | |
---|
415 | child1_=new thrust::device_vector<int>(prealloc,-1); |
---|
416 | parent_=new thrust::device_vector<int>(prealloc,-1); |
---|
417 | cuda::kd_tree_builder_detail::SplitInfo s; |
---|
418 | s.left=0; |
---|
419 | s.right=0; |
---|
420 | splits_=new thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>(prealloc,s); |
---|
421 | s.right=points.size(); |
---|
422 | (*splits_)[0]=s; |
---|
423 | |
---|
424 | aabb_min_=new thrust::device_vector<float4>(prealloc); |
---|
425 | aabb_max_=new thrust::device_vector<float4>(prealloc); |
---|
426 | |
---|
427 | index_x_=new thrust::device_vector<int>(points_->size()); |
---|
428 | index_y_=new thrust::device_vector<int>(points_->size()); |
---|
429 | index_z_=new thrust::device_vector<int>(points_->size()); |
---|
430 | |
---|
431 | owners_x_=new thrust::device_vector<int>(points_->size(),0); |
---|
432 | owners_y_=new thrust::device_vector<int>(points_->size(),0); |
---|
433 | owners_z_=new thrust::device_vector<int>(points_->size(),0); |
---|
434 | |
---|
435 | leftright_x_ = new thrust::device_vector<int>(points_->size(),0); |
---|
436 | leftright_y_ = new thrust::device_vector<int>(points_->size(),0); |
---|
437 | leftright_z_ = new thrust::device_vector<int>(points_->size(),0); |
---|
438 | |
---|
439 | tmp_index_=new thrust::device_vector<int>(points_->size()); |
---|
440 | tmp_owners_=new thrust::device_vector<int>(points_->size()); |
---|
441 | tmp_misc_=new thrust::device_vector<int>(points_->size()); |
---|
442 | |
---|
443 | points_x_=new thrust::device_vector<float>(points_->size()); |
---|
444 | points_y_=new thrust::device_vector<float>(points_->size()); |
---|
445 | points_z_=new thrust::device_vector<float>(points_->size()); |
---|
446 | delete_node_info_=false; |
---|
447 | } |
---|
448 | |
---|
449 | ~CudaKdTreeBuilder() |
---|
450 | { |
---|
451 | if( delete_node_info_ ) { |
---|
452 | delete child1_; |
---|
453 | delete parent_; |
---|
454 | delete splits_; |
---|
455 | delete aabb_min_; |
---|
456 | delete aabb_max_; |
---|
457 | delete index_x_; |
---|
458 | } |
---|
459 | |
---|
460 | delete index_y_; |
---|
461 | delete index_z_; |
---|
462 | delete owners_x_; |
---|
463 | delete owners_y_; |
---|
464 | delete owners_z_; |
---|
465 | delete points_x_; |
---|
466 | delete points_y_; |
---|
467 | delete points_z_; |
---|
468 | delete leftright_x_; |
---|
469 | delete leftright_y_; |
---|
470 | delete leftright_z_; |
---|
471 | delete tmp_index_; |
---|
472 | delete tmp_owners_; |
---|
473 | delete tmp_misc_; |
---|
474 | } |
---|
475 | |
---|
476 | //! build the tree |
---|
477 | //! general idea: |
---|
478 | //! - build sorted lists of the points in x y and z order (to be able to compute tight AABBs in O(1) ) |
---|
479 | //! - while( nodes to split exist ) |
---|
480 | //! - split non-child nodes along longest axis if the number of points is > max_points_per_node |
---|
481 | //! - for each point: determine whether it is in a node that was split. If yes, mark it as belonging to the left or right child node of its current parent node |
---|
482 | //! - reorder the points so that the points of a single node are continuous in the node array |
---|
483 | //! - update the left/right pointers and AABBs of all nodes |
---|
484 | void buildTree() |
---|
485 | { |
---|
486 | // std::cout<<"buildTree()"<<std::endl; |
---|
487 | // sleep(1); |
---|
488 | // Util::Timer stepTimer; |
---|
489 | thrust::transform( points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() ); |
---|
490 | |
---|
491 | thrust::counting_iterator<int> it(0); |
---|
492 | thrust::copy( it, it+points_->size(), index_x_->begin() ); |
---|
493 | |
---|
494 | thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() ); |
---|
495 | thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() ); |
---|
496 | |
---|
497 | thrust::device_vector<float> tmpv(points_->size()); |
---|
498 | |
---|
499 | // create sorted index list -> can be used to compute AABBs in O(1) |
---|
500 | thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin()); |
---|
501 | thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() ); |
---|
502 | thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin()); |
---|
503 | thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() ); |
---|
504 | thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin()); |
---|
505 | thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() ); |
---|
506 | |
---|
507 | |
---|
508 | (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0); |
---|
509 | |
---|
510 | (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0); |
---|
511 | #ifdef PRINT_DEBUG_TIMING |
---|
512 | cudaDeviceSynchronize(); |
---|
513 | std::cout<<" initial stuff:"<<stepTimer.elapsed()<<std::endl; |
---|
514 | stepTimer.restart(); |
---|
515 | #endif |
---|
516 | int last_node_count=0; |
---|
517 | for( int i=0;; i++ ) { |
---|
518 | cuda::kd_tree_builder_detail::SplitNodes sn; |
---|
519 | |
---|
520 | sn.maxPointsPerNode=max_leaf_size_; |
---|
521 | sn.node_count=thrust::raw_pointer_cast(&allocation_info_[NodeCount]); |
---|
522 | sn.nodes_allocated=thrust::raw_pointer_cast(&allocation_info_[NodesAllocated]); |
---|
523 | sn.out_of_space=thrust::raw_pointer_cast(&allocation_info_[OutOfSpace]); |
---|
524 | sn.child1_=thrust::raw_pointer_cast(&(*child1_)[0]); |
---|
525 | sn.parent_=thrust::raw_pointer_cast(&(*parent_)[0]); |
---|
526 | sn.splits=thrust::raw_pointer_cast(&(*splits_)[0]); |
---|
527 | |
---|
528 | thrust::counting_iterator<int> cit(0); |
---|
529 | thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(), splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit )), |
---|
530 | thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count )), |
---|
531 | sn ); |
---|
532 | // copy allocation info to host |
---|
533 | thrust::host_vector<int> alloc_info = allocation_info_; |
---|
534 | |
---|
535 | if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done |
---|
536 | break; |
---|
537 | } |
---|
538 | last_node_count=alloc_info[NodeCount]; |
---|
539 | |
---|
540 | // a node was un-splittable due to a lack of space |
---|
541 | if( alloc_info[OutOfSpace]==1 ) { |
---|
542 | resize_node_vectors(alloc_info[NodesAllocated]*2); |
---|
543 | alloc_info[OutOfSpace]=0; |
---|
544 | alloc_info[NodesAllocated]*=2; |
---|
545 | allocation_info_=alloc_info; |
---|
546 | } |
---|
547 | #ifdef PRINT_DEBUG_TIMING |
---|
548 | cudaDeviceSynchronize(); |
---|
549 | std::cout<<" node split:"<<stepTimer.elapsed()<<std::endl; |
---|
550 | stepTimer.restart(); |
---|
551 | #endif |
---|
552 | |
---|
553 | // foreach point: point was in node that was split?move it to child (leaf) node : do nothing |
---|
554 | cuda::kd_tree_builder_detail::MovePointsToChildNodes sno( thrust::raw_pointer_cast(&(*child1_)[0]), |
---|
555 | thrust::raw_pointer_cast(&(*splits_)[0]), |
---|
556 | thrust::raw_pointer_cast(&(*points_x_)[0]), |
---|
557 | thrust::raw_pointer_cast(&(*points_y_)[0]), |
---|
558 | thrust::raw_pointer_cast(&(*points_z_)[0]), |
---|
559 | thrust::raw_pointer_cast(&(*owners_x_)[0]), |
---|
560 | thrust::raw_pointer_cast(&(*owners_y_)[0]), |
---|
561 | thrust::raw_pointer_cast(&(*owners_z_)[0]), |
---|
562 | thrust::raw_pointer_cast(&(*leftright_x_)[0]), |
---|
563 | thrust::raw_pointer_cast(&(*leftright_y_)[0]), |
---|
564 | thrust::raw_pointer_cast(&(*leftright_z_)[0]) |
---|
565 | ); |
---|
566 | thrust::counting_iterator<int> ci0(0); |
---|
567 | thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ), |
---|
568 | thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno ); |
---|
569 | |
---|
570 | #ifdef PRINT_DEBUG_TIMING |
---|
571 | cudaDeviceSynchronize(); |
---|
572 | std::cout<<" set new owners:"<<stepTimer.elapsed()<<std::endl; |
---|
573 | stepTimer.restart(); |
---|
574 | #endif |
---|
575 | |
---|
576 | // move points around so that each leaf node's points are continuous |
---|
577 | separate_left_and_right_children(*index_x_,*owners_x_,*tmp_index_,*tmp_owners_, *leftright_x_); |
---|
578 | std::swap(tmp_index_, index_x_); |
---|
579 | std::swap(tmp_owners_, owners_x_); |
---|
580 | separate_left_and_right_children(*index_y_,*owners_y_,*tmp_index_,*tmp_owners_, *leftright_y_,false); |
---|
581 | std::swap(tmp_index_, index_y_); |
---|
582 | separate_left_and_right_children(*index_z_,*owners_z_,*tmp_index_,*tmp_owners_, *leftright_z_,false); |
---|
583 | std::swap(tmp_index_, index_z_); |
---|
584 | |
---|
585 | #ifdef PRINT_DEBUG_TIMING |
---|
586 | cudaDeviceSynchronize(); |
---|
587 | std::cout<<" split:"<<stepTimer.elapsed()<<std::endl; |
---|
588 | stepTimer.restart(); |
---|
589 | #endif |
---|
590 | // calculate new AABB etc |
---|
591 | update_leftright_and_aabb( *points_x_, *points_y_, *points_z_, *index_x_, *index_y_, *index_z_, *owners_x_, *splits_,*aabb_min_, *aabb_max_); |
---|
592 | #ifdef PRINT_DEBUG_TIMING |
---|
593 | cudaDeviceSynchronize(); |
---|
594 | std::cout<<" update_leftright_and_aabb:"<<stepTimer.elapsed()<<std::endl; |
---|
595 | stepTimer.restart(); |
---|
596 | print_vector(node_count_); |
---|
597 | #endif |
---|
598 | |
---|
599 | } |
---|
600 | } |
---|
601 | |
---|
602 | template<class Distance> |
---|
603 | friend class KDTreeCuda3dIndex; |
---|
604 | |
---|
605 | protected: |
---|
606 | |
---|
607 | |
---|
608 | //! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs |
---|
609 | void |
---|
610 | update_leftright_and_aabb( const thrust::device_vector<float>& x, const thrust::device_vector<float>& y,const thrust::device_vector<float>& z, |
---|
611 | const thrust::device_vector<int>& ix, const thrust::device_vector<int>& iy,const thrust::device_vector<int>& iz, |
---|
612 | const thrust::device_vector<int>& owners, |
---|
613 | thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>& splits, thrust::device_vector<float4>& aabbMin,thrust::device_vector<float4>& aabbMax) |
---|
614 | { |
---|
615 | thrust::device_vector<int>* labelsUnique=tmp_owners_; |
---|
616 | thrust::device_vector<int>* countsUnique=tmp_index_; |
---|
617 | // assume: points of each node are continuous in the array |
---|
618 | |
---|
619 | // find which nodes are here, and where each node's points begin and end |
---|
620 | int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator<int>(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin(); |
---|
621 | |
---|
622 | // update the info |
---|
623 | cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s; |
---|
624 | s.maxPoints=x.size(); |
---|
625 | s.nElements=unique_labels; |
---|
626 | s.nodes=thrust::raw_pointer_cast(&(splits[0])); |
---|
627 | s.counts=thrust::raw_pointer_cast(&( (*countsUnique)[0])); |
---|
628 | s.labels=thrust::raw_pointer_cast(&( (*labelsUnique)[0])); |
---|
629 | s.x=thrust::raw_pointer_cast(&x[0]); |
---|
630 | s.y=thrust::raw_pointer_cast(&y[0]); |
---|
631 | s.z=thrust::raw_pointer_cast(&z[0]); |
---|
632 | s.ix=thrust::raw_pointer_cast(&ix[0]); |
---|
633 | s.iy=thrust::raw_pointer_cast(&iy[0]); |
---|
634 | s.iz=thrust::raw_pointer_cast(&iz[0]); |
---|
635 | s.aabbMin=thrust::raw_pointer_cast(&aabbMin[0]); |
---|
636 | s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]); |
---|
637 | |
---|
638 | thrust::counting_iterator<int> it(0); |
---|
639 | thrust::for_each(it, it+unique_labels, s); |
---|
640 | } |
---|
641 | |
---|
642 | //! Separates the left and right children of each node into continuous parts of the array. |
---|
643 | //! More specifically, it seperates children with even and odd node indices because nodes are always |
---|
644 | //! allocated in pairs -> child1==child2+1 -> child1 even and child2 odd, or vice-versa. |
---|
645 | //! Since the split operation is stable, this results in continuous partitions |
---|
646 | //! for all the single nodes. |
---|
647 | //! (basically the split primitive according to sengupta et al) |
---|
648 | //! about twice as fast as thrust::partition |
---|
649 | void separate_left_and_right_children( thrust::device_vector<int>& key_in, thrust::device_vector<int>& val_in, thrust::device_vector<int>& key_out, thrust::device_vector<int>& val_out, thrust::device_vector<int>& left_right_marks, bool scatter_val_out=true ) |
---|
650 | { |
---|
651 | thrust::device_vector<int>* f_tmp = &val_out; |
---|
652 | thrust::device_vector<int>* addr_tmp = tmp_misc_; |
---|
653 | |
---|
654 | thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/ |
---|
655 | /*())*/, /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven*/ |
---|
656 | /*())*/, f_tmp->begin() ); |
---|
657 | cuda::kd_tree_builder_detail::set_addr3 sa; |
---|
658 | sa.val_=thrust::raw_pointer_cast(&left_right_marks[0]); |
---|
659 | sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]); |
---|
660 | sa.npoints_=key_in.size(); |
---|
661 | thrust::counting_iterator<int> it(0); |
---|
662 | thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa); |
---|
663 | |
---|
664 | thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin()); |
---|
665 | if( scatter_val_out ) thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin()); |
---|
666 | } |
---|
667 | |
---|
668 | //! allocates additional space in all the node-related vectors. |
---|
669 | //! new_size elements will be added to all vectors. |
---|
670 | void resize_node_vectors( size_t new_size ) |
---|
671 | { |
---|
672 | size_t add = new_size - child1_->size(); |
---|
673 | child1_->insert(child1_->end(), add, -1); |
---|
674 | parent_->insert(parent_->end(), add, -1); |
---|
675 | cuda::kd_tree_builder_detail::SplitInfo s; |
---|
676 | s.left=0; |
---|
677 | s.right=0; |
---|
678 | splits_->insert(splits_->end(), add, s); |
---|
679 | float4 f; |
---|
680 | aabb_min_->insert(aabb_min_->end(), add, f); |
---|
681 | aabb_max_->insert(aabb_max_->end(), add, f); |
---|
682 | } |
---|
683 | |
---|
684 | |
---|
685 | const thrust::device_vector<float4>* points_; |
---|
686 | |
---|
687 | // tree data, those are stored per-node |
---|
688 | |
---|
689 | //! left child of each node. (right child==left child + 1, due to the alloc mechanism) |
---|
690 | //! child1_[node]==-1 if node is a leaf node |
---|
691 | thrust::device_vector<int>* child1_; |
---|
692 | //! parent node of each node |
---|
693 | thrust::device_vector<int>* parent_; |
---|
694 | //! split info (dim/value or left/right pointers) |
---|
695 | thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>* splits_; |
---|
696 | //! min aabb value of each node |
---|
697 | thrust::device_vector<float4>* aabb_min_; |
---|
698 | //! max aabb value of each node |
---|
699 | thrust::device_vector<float4>* aabb_max_; |
---|
700 | |
---|
701 | enum AllocationInfo |
---|
702 | { |
---|
703 | NodeCount=0, |
---|
704 | NodesAllocated=1, |
---|
705 | OutOfSpace=2 |
---|
706 | }; |
---|
707 | // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them |
---|
708 | // thrust::device_vector<int> out_of_space_; |
---|
709 | // thrust::device_vector<int> node_count_; |
---|
710 | // thrust::device_vector<int> nodes_allocated_; |
---|
711 | thrust::device_vector<int> allocation_info_; |
---|
712 | |
---|
713 | int max_leaf_size_; |
---|
714 | |
---|
715 | // coordinate values of the points |
---|
716 | thrust::device_vector<float>* points_x_, * points_y_, * points_z_; |
---|
717 | // indices |
---|
718 | thrust::device_vector<int>* index_x_, * index_y_, * index_z_; |
---|
719 | // owner node |
---|
720 | thrust::device_vector<int>* owners_x_, * owners_y_, * owners_z_; |
---|
721 | // contains info about whether a point was partitioned to the left or right child after a split |
---|
722 | thrust::device_vector<int>* leftright_x_, * leftright_y_, * leftright_z_; |
---|
723 | thrust::device_vector<int>* tmp_index_, * tmp_owners_, * tmp_misc_; |
---|
724 | bool delete_node_info_; |
---|
725 | }; |
---|
726 | |
---|
727 | |
---|
728 | } // namespace flann |
---|
729 | #endif |
---|