Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/algorithms/kdtree_cuda_builder.h @ 15840

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago

#2886 added utility console program for clustering of expressions

File size: 29.0 KB
Line 
1/***********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2011       Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved.
5 *
6 * THE BSD LICENSE
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *************************************************************************/
29
30#ifndef FLANN_CUDA_KD_TREE_BUILDER_H_
31#define FLANN_CUDA_KD_TREE_BUILDER_H_
32#include <thrust/host_vector.h>
33#include <thrust/device_vector.h>
34#include <thrust/sort.h>
35#include <thrust/partition.h>
36#include <thrust/unique.h>
37#include <thrust/scan.h>
38#include <flann/util/cutil_math.h>
39#include <stdlib.h>
40
41// #define PRINT_DEBUG_TIMING
42
43namespace flann
44{
45//      template< typename T >
46//      void print_vector(  const thrust::device_vector<T>& v )
47//      {
48//              for( int i=0; i< v.size(); i++ )
49//              {
50//                      std::cout<<v[i]<<std::endl;
51//              }
52//      }
53//
54//      template< typename T1, typename T2 >
55//      void print_vector(  const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2 )
56//      {
57//              for( int i=0; i< v1.size(); i++ )
58//              {
59//                      std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<std::endl;
60//              }
61//      }
62//
63//      template< typename T1, typename T2, typename T3 >
64//      void print_vector(  const thrust::device_vector<T1>& v1, const thrust::device_vector<T2>& v2, const thrust::device_vector<T3>& v3 )
65//      {
66//              for( int i=0; i< v1.size(); i++ )
67//              {
68//                      std::cout<<i<<": "<<v1[i]<<" "<<v2[i]<<" "<<v3[i]<<std::endl;
69//              }
70//      }
71//
72//      template< typename T >
73//      void print_vector_by_index(  const thrust::device_vector<T>& v,const thrust::device_vector<int>& ind )
74//      {
75//              for( int i=0; i< v.size(); i++ )
76//              {
77//                      std::cout<<v[ind[i]]<<std::endl;
78//              }
79//      }
80
81//      std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s) {
82//      stream<<"(split l/r: "<< s.left <<" "<< s.right<< "  split:"<<s.split_dim<<" "<<s.split_val<<")";
83//      return stream;
84// }
85//
86//
87// std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::NodeInfo& s) {
88//      stream<<"(node: "<<s.child1()<<" "<<s.parent()<<" "<<s.child2()<<")";
89//      return stream;
90// }
91//
92// std::ostream& operator <<(std::ostream& stream, const float4& s) {
93//      stream<<"("<<s.x<<","<<s.y<<","<<s.z<<","<<s.w<<")";
94//      return stream;
95// }
96namespace cuda
97{
98namespace kd_tree_builder_detail
99{
100//! normal node: contains the split dimension and value
101//! leaf node: left == index of first points, right==index of last point +1
102struct SplitInfo
103{
104    union {
105        struct
106        {
107            // begin of child nodes
108            int left;
109            // end of child nodes
110            int right;
111        };
112        struct
113        {
114            int split_dim;
115            float split_val;
116        };
117    };
118
119};
120
121struct IsEven
122{
123    typedef int result_type;
124    __device__
125    int operator()(int i )
126    {
127        return (i& 1)==0;
128    }
129};
130
131struct SecondElementIsEven
132{
133    __host__ __device__
134    bool operator()( const thrust::tuple<int,int>& i )
135    {
136        return (thrust::get<1>(i)& 1)==0;
137    }
138};
139
140//! just for convenience: access a float4 by an index in [0,1,2]
141//! (casting it to a float* and accessing it by the index is way slower...)
142__host__ __device__
143float get_value_by_index( const float4& f, int i )
144{
145    switch(i) {
146    case 0:
147        return f.x;
148    case 1:
149        return f.y;
150    default:
151        return f.z;
152    }
153
154}
155
156//! mark a point as belonging to the left or right child of its current parent
157//! called after parents are split
158struct MovePointsToChildNodes
159{
160    MovePointsToChildNodes( int* child1, SplitInfo* splits, float* x, float* y, float* z, int* ox, int* oy, int* oz, int* lrx, int* lry, int* lrz )
161        : child1_(child1), splits_(splits), x_(x), y_(y), z_(z), ox_(ox), oy_(oy), oz_(oz), lrx_(lrx), lry_(lry), lrz_(lrz){}
162    //  int dim;
163    //  float threshold;
164    int* child1_;
165    SplitInfo* splits_;
166
167    // coordinate values
168    float* x_, * y_, * z_;
169    // owner indices -> which node does the point belong to?
170    int* ox_, * oy_, * oz_;
171    // temp info: will be set to 1 of a point is moved to the right child node, 0 otherwise
172    // (used later in the scan op to separate the points of the children into continuous ranges)
173    int* lrx_, * lry_, * lrz_;
174    __device__
175    void operator()( const thrust::tuple<int, int, int, int>& data )
176    {
177        int index = thrust::get<0>(data);
178        int owner = ox_[index]; // before a split, all points at the same position in the index array have the same owner
179        int point_ind1=thrust::get<1>(data);
180        int point_ind2=thrust::get<2>(data);
181        int point_ind3=thrust::get<3>(data);
182        int leftChild=child1_[owner];
183        int split_dim;
184        float dim_val1, dim_val2, dim_val3;
185        SplitInfo split;
186        lrx_[index]=0;
187        lry_[index]=0;
188        lrz_[index]=0;
189        // this element already belongs to a leaf node -> everything alright, no need to change anything
190        if( leftChild==-1 ) {
191            return;
192        }
193        // otherwise: load split data, and assign this index to the new owner
194        split = splits_[owner];
195        split_dim=split.split_dim;
196        switch( split_dim ) {
197        case 0:
198            dim_val1=x_[point_ind1];
199            dim_val2=x_[point_ind2];
200            dim_val3=x_[point_ind3];
201            break;
202        case 1:
203            dim_val1=y_[point_ind1];
204            dim_val2=y_[point_ind2];
205            dim_val3=y_[point_ind3];
206            break;
207        default:
208            dim_val1=z_[point_ind1];
209            dim_val2=z_[point_ind2];
210            dim_val3=z_[point_ind3];
211            break;
212
213        }
214
215
216        int r1=leftChild +(dim_val1 > split.split_val);
217        ox_[index]=r1;
218        int r2=leftChild+(dim_val2 > split.split_val);
219        oy_[index]=r2;
220        oz_[index]=leftChild+(dim_val3 > split.split_val);
221
222        lrx_[index] = (dim_val1 > split.split_val);
223        lry_[index] = (dim_val2 > split.split_val);
224        lrz_[index] = (dim_val3 > split.split_val);
225        //                      return thrust::make_tuple( r1, r2, leftChild+(dim_val > split.split_val) );
226    }
227};
228
229//! used to update the left/right pointers and aabb infos after the node splits
230struct SetLeftAndRightAndAABB
231{
232    int maxPoints;
233    int nElements;
234
235    SplitInfo* nodes;
236    int* counts;
237    int* labels;
238    float4* aabbMin;
239    float4* aabbMax;
240    const float* x,* y,* z;
241    const int* ix, * iy, * iz;
242
243    __host__ __device__
244    void operator()( int i )
245    {
246        int index=labels[i];
247        int right;
248        int left = counts[i];
249        nodes[index].left=left;
250        if( i < nElements-1 ) {
251            right=counts[i+1];
252        }
253        else { // index==nNodes
254            right=maxPoints;
255        }
256        nodes[index].right=right;
257        aabbMin[index].x=x[ix[left]];
258        aabbMin[index].y=y[iy[left]];
259        aabbMin[index].z=z[iz[left]];
260        aabbMax[index].x=x[ix[right-1]];
261        aabbMax[index].y=y[iy[right-1]];
262        aabbMax[index].z=z[iz[right-1]];
263    }
264};
265
266
267//! - decide whether a node has to be split
268//! if yes:
269//! - allocate child nodes
270//! - set split axis as axis of maximum aabb length
271struct SplitNodes
272{
273    int maxPointsPerNode;
274    int* node_count;
275    int* nodes_allocated;
276    int* out_of_space;
277    int* child1_;
278    int* parent_;
279    SplitInfo* splits;
280
281    __device__
282    void operator()( thrust::tuple<int&, int&,SplitInfo&,float4&,float4&, int> node ) // float4: aabbMin, aabbMax
283    {
284        int& parent=thrust::get<0>(node);
285        int& child1=thrust::get<1>(node);
286        SplitInfo& s=thrust::get<2>(node);
287        const float4& aabbMin=thrust::get<3>(node);
288        const float4& aabbMax=thrust::get<4>(node);
289        int my_index = thrust::get<5>(node);
290        bool split_node=false;
291        // first, each thread block counts the number of nodes that it needs to allocate...
292        __shared__ int block_nodes_to_allocate;
293        if( threadIdx.x== 0 ) block_nodes_to_allocate=0;
294        __syncthreads();
295
296        // don't split if all points are equal
297        // (could lead to an infinite loop, and doesn't make any sense anyway)
298        bool all_points_in_node_are_equal=aabbMin.x == aabbMax.x && aabbMin.y==aabbMax.y && aabbMin.z==aabbMax.z;
299
300        int offset_to_global=0;
301
302        // maybe this could be replaced with a reduction...
303        if(( child1==-1) &&( s.right-s.left > maxPointsPerNode) && !all_points_in_node_are_equal ) { // leaf node
304            split_node=true;
305            offset_to_global = atomicAdd( &block_nodes_to_allocate,2 );
306        }
307
308        __syncthreads();
309        __shared__ int block_left;
310        __shared__ bool enough_space;
311        // ... then the first thread tries to allocate this many nodes...
312        if( threadIdx.x==0) {
313            block_left = atomicAdd( node_count, block_nodes_to_allocate );
314            enough_space = block_left+block_nodes_to_allocate < *nodes_allocated;
315            // if it doesn't succeed, no nodes will be created by this block
316            if( !enough_space ) {
317                atomicAdd( node_count, -block_nodes_to_allocate );
318                *out_of_space=1;
319            }
320        }
321
322        __syncthreads();
323        // this thread needs to split it's node && there was enough space for all the nodes
324        // in this block.
325        //(The whole "allocate-per-block-thing" is much faster than letting each element allocate
326        // its space on its own, because shared memory atomics are A LOT faster than
327        // global mem atomics!)
328        if( split_node && enough_space ) {
329            int left = block_left + offset_to_global;
330
331            splits[left].left=s.left;
332            splits[left].right=s.right;
333            splits[left+1].left=0;
334            splits[left+1].right=0;
335
336            // split axis/position: middle of longest aabb extent
337            float4 aabbDim=aabbMax-aabbMin;
338            int maxDim=0;
339            float maxDimLength=aabbDim.x;
340            float4 splitVal=(aabbMax+aabbMin);
341            splitVal*=0.5f;
342            for( int i=1; i<=2; i++ ) {
343                float val = get_value_by_index(aabbDim,i);
344                if( val > maxDimLength ) {
345                    maxDim=i;
346                    maxDimLength=val;
347                }
348            }
349            s.split_dim=maxDim;
350            s.split_val=get_value_by_index(splitVal,maxDim);
351
352            child1_[my_index]=left;
353            splits[my_index]=s;
354
355            parent_[left]=my_index;
356            parent_[left+1]=my_index;
357            child1_[left]=-1;
358            child1_[left+1]=-1;
359        }
360    }
361};
362
363
364//! computes the scatter target address for the split operation, see Sengupta,Harris,Zhang,Owen: Scan Primitives for GPU Computing
365//! in my use case, this is about 2x as fast as thrust::partition
366struct set_addr3
367{
368    const int* val_, * f_;
369
370    int npoints_;
371    __device__
372    int operator()( int id )
373    {
374        int nf = f_[npoints_-1] + (val_[npoints_-1]);
375        int f=f_[id];
376        int t = id -f+nf;
377        return val_[id] ? f : t;
378    }
379};
380
381//! converts a float4 point (xyz) to a tuple of three float vals (used to separate the
382//! float4 input buffer into three arrays in the beginning of the tree build)
383struct pointxyz_to_px_py_pz
384{
385    __device__
386    thrust::tuple<float,float,float> operator()( const float4& val )
387    {
388        return thrust::make_tuple(val.x, val.y, val.z);
389    }
390};
391} // namespace kd_tree_builder_detail
392
393} // namespace cuda
394
395
396std::ostream& operator <<(std::ostream& stream, const cuda::kd_tree_builder_detail::SplitInfo& s)
397{
398    stream<<"(split l/r: "<< s.left <<" "<< s.right<< "  split:"<<s.split_dim<<" "<<s.split_val<<")";
399    return stream;
400}
401class CudaKdTreeBuilder
402{
403public:
404    CudaKdTreeBuilder( const thrust::device_vector<float4>& points, int max_leaf_size ) : /*out_of_space_(1,0),node_count_(1,1),*/ max_leaf_size_(max_leaf_size)
405    {
406        points_=&points;
407        int prealloc = points.size()/max_leaf_size_*16;
408        allocation_info_.resize(3);
409        allocation_info_[NodeCount]=1;
410        allocation_info_[NodesAllocated]=prealloc;
411        allocation_info_[OutOfSpace]=0;
412
413        //              std::cout<<points_->size()<<std::endl;
414
415        child1_=new thrust::device_vector<int>(prealloc,-1);
416        parent_=new thrust::device_vector<int>(prealloc,-1);
417        cuda::kd_tree_builder_detail::SplitInfo s;
418        s.left=0;
419        s.right=0;
420        splits_=new thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>(prealloc,s);
421        s.right=points.size();
422        (*splits_)[0]=s;
423
424        aabb_min_=new thrust::device_vector<float4>(prealloc);
425        aabb_max_=new thrust::device_vector<float4>(prealloc);
426
427        index_x_=new thrust::device_vector<int>(points_->size());
428        index_y_=new thrust::device_vector<int>(points_->size());
429        index_z_=new thrust::device_vector<int>(points_->size());
430
431        owners_x_=new thrust::device_vector<int>(points_->size(),0);
432        owners_y_=new thrust::device_vector<int>(points_->size(),0);
433        owners_z_=new thrust::device_vector<int>(points_->size(),0);
434
435        leftright_x_ = new thrust::device_vector<int>(points_->size(),0);
436        leftright_y_ = new thrust::device_vector<int>(points_->size(),0);
437        leftright_z_ = new thrust::device_vector<int>(points_->size(),0);
438
439        tmp_index_=new thrust::device_vector<int>(points_->size());
440        tmp_owners_=new thrust::device_vector<int>(points_->size());
441        tmp_misc_=new thrust::device_vector<int>(points_->size());
442
443        points_x_=new thrust::device_vector<float>(points_->size());
444        points_y_=new thrust::device_vector<float>(points_->size());
445        points_z_=new thrust::device_vector<float>(points_->size());
446        delete_node_info_=false;
447    }
448
449    ~CudaKdTreeBuilder()
450    {
451        if( delete_node_info_ ) {
452            delete child1_;
453            delete parent_;
454            delete splits_;
455            delete aabb_min_;
456            delete aabb_max_;
457            delete index_x_;
458        }
459
460        delete index_y_;
461        delete index_z_;
462        delete owners_x_;
463        delete owners_y_;
464        delete owners_z_;
465        delete points_x_;
466        delete points_y_;
467        delete points_z_;
468        delete leftright_x_;
469        delete leftright_y_;
470        delete leftright_z_;
471        delete tmp_index_;
472        delete tmp_owners_;
473        delete tmp_misc_;
474    }
475
476  //! build the tree
477  //! general idea:
478  //! - build sorted lists of the points in x y and z order (to be able to compute tight AABBs in O(1) )
479  //! - while( nodes to split exist )
480  //!    - split non-child nodes along longest axis if the number of points is > max_points_per_node
481  //!    - for each point: determine whether it is in a node that was split. If yes, mark it as belonging to the left or right child node of its current parent node
482  //!    - reorder the points so that the points of a single node are continuous in the node array
483  //!    - update the left/right pointers and AABBs of all nodes
484    void buildTree()
485    {
486        //              std::cout<<"buildTree()"<<std::endl;
487        //              sleep(1);
488        //              Util::Timer stepTimer;
489        thrust::transform( points_->begin(), points_->end(), thrust::make_zip_iterator(thrust::make_tuple(points_x_->begin(), points_y_->begin(),points_z_->begin()) ), cuda::kd_tree_builder_detail::pointxyz_to_px_py_pz() );
490
491        thrust::counting_iterator<int> it(0);
492        thrust::copy( it, it+points_->size(), index_x_->begin() );
493
494        thrust::copy( index_x_->begin(), index_x_->end(), index_y_->begin() );
495        thrust::copy( index_x_->begin(), index_x_->end(), index_z_->begin() );
496
497        thrust::device_vector<float> tmpv(points_->size());
498
499        // create sorted index list -> can be used to compute AABBs in O(1)
500        thrust::copy(points_x_->begin(), points_x_->end(), tmpv.begin());
501        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_x_->begin() );
502        thrust::copy(points_y_->begin(), points_y_->end(), tmpv.begin());
503        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_y_->begin() );
504        thrust::copy(points_z_->begin(), points_z_->end(), tmpv.begin());
505        thrust::sort_by_key( tmpv.begin(), tmpv.end(), index_z_->begin() );
506
507
508        (*aabb_min_)[0]=make_float4((*points_x_)[(*index_x_)[0]],(*points_y_)[(*index_y_)[0]],(*points_z_)[(*index_z_)[0]],0);
509
510        (*aabb_max_)[0]=make_float4((*points_x_)[(*index_x_)[points_->size()-1]],(*points_y_)[(*index_y_)[points_->size()-1]],(*points_z_)[(*index_z_)[points_->size()-1]],0);
511        #ifdef PRINT_DEBUG_TIMING
512        cudaDeviceSynchronize();
513        std::cout<<" initial stuff:"<<stepTimer.elapsed()<<std::endl;
514        stepTimer.restart();
515        #endif
516        int last_node_count=0;
517        for( int i=0;; i++ ) {
518            cuda::kd_tree_builder_detail::SplitNodes sn;
519
520            sn.maxPointsPerNode=max_leaf_size_;
521            sn.node_count=thrust::raw_pointer_cast(&allocation_info_[NodeCount]);
522            sn.nodes_allocated=thrust::raw_pointer_cast(&allocation_info_[NodesAllocated]);
523            sn.out_of_space=thrust::raw_pointer_cast(&allocation_info_[OutOfSpace]);
524            sn.child1_=thrust::raw_pointer_cast(&(*child1_)[0]);
525            sn.parent_=thrust::raw_pointer_cast(&(*parent_)[0]);
526            sn.splits=thrust::raw_pointer_cast(&(*splits_)[0]);
527
528            thrust::counting_iterator<int> cit(0);
529            thrust::for_each( thrust::make_zip_iterator(thrust::make_tuple( parent_->begin(), child1_->begin(),  splits_->begin(), aabb_min_->begin(), aabb_max_->begin(), cit  )),
530                              thrust::make_zip_iterator(thrust::make_tuple( parent_->begin()+last_node_count, child1_->begin()+last_node_count,splits_->begin()+last_node_count, aabb_min_->begin()+last_node_count, aabb_max_->begin()+last_node_count,cit+last_node_count  )),
531                              sn   );
532            // copy allocation info to host
533            thrust::host_vector<int> alloc_info = allocation_info_;
534
535            if( last_node_count == alloc_info[NodeCount] ) { // no more nodes were split -> done
536                break;
537            }
538            last_node_count=alloc_info[NodeCount];
539     
540      // a node was un-splittable due to a lack of space
541            if( alloc_info[OutOfSpace]==1 ) {
542                resize_node_vectors(alloc_info[NodesAllocated]*2);
543                alloc_info[OutOfSpace]=0;
544                alloc_info[NodesAllocated]*=2;
545                allocation_info_=alloc_info;
546            }
547            #ifdef PRINT_DEBUG_TIMING
548            cudaDeviceSynchronize();
549            std::cout<<" node split:"<<stepTimer.elapsed()<<std::endl;
550            stepTimer.restart();
551            #endif
552
553            // foreach point: point was in node that was split?move it to child (leaf) node : do nothing
554            cuda::kd_tree_builder_detail::MovePointsToChildNodes sno( thrust::raw_pointer_cast(&(*child1_)[0]),
555                                                                      thrust::raw_pointer_cast(&(*splits_)[0]),
556                                                                      thrust::raw_pointer_cast(&(*points_x_)[0]),
557                                                                      thrust::raw_pointer_cast(&(*points_y_)[0]),
558                                                                      thrust::raw_pointer_cast(&(*points_z_)[0]),
559                                                                      thrust::raw_pointer_cast(&(*owners_x_)[0]),
560                                                                      thrust::raw_pointer_cast(&(*owners_y_)[0]),
561                                                                      thrust::raw_pointer_cast(&(*owners_z_)[0]),
562                                                                      thrust::raw_pointer_cast(&(*leftright_x_)[0]),
563                                                                      thrust::raw_pointer_cast(&(*leftright_y_)[0]),
564                                                                      thrust::raw_pointer_cast(&(*leftright_z_)[0])
565                                                                      );
566            thrust::counting_iterator<int> ci0(0);
567            thrust::for_each( thrust::make_zip_iterator( thrust::make_tuple( ci0, index_x_->begin(), index_y_->begin(), index_z_->begin()) ),
568                              thrust::make_zip_iterator( thrust::make_tuple( ci0+points_->size(), index_x_->end(), index_y_->end(), index_z_->end()) ),sno  );
569
570            #ifdef PRINT_DEBUG_TIMING
571            cudaDeviceSynchronize();
572            std::cout<<" set new owners:"<<stepTimer.elapsed()<<std::endl;
573            stepTimer.restart();
574            #endif
575
576            // move points around so that each leaf node's points are continuous
577            separate_left_and_right_children(*index_x_,*owners_x_,*tmp_index_,*tmp_owners_, *leftright_x_);
578            std::swap(tmp_index_, index_x_);
579            std::swap(tmp_owners_, owners_x_);
580            separate_left_and_right_children(*index_y_,*owners_y_,*tmp_index_,*tmp_owners_, *leftright_y_,false);
581            std::swap(tmp_index_, index_y_);
582            separate_left_and_right_children(*index_z_,*owners_z_,*tmp_index_,*tmp_owners_, *leftright_z_,false);
583            std::swap(tmp_index_, index_z_);
584
585            #ifdef PRINT_DEBUG_TIMING
586            cudaDeviceSynchronize();
587            std::cout<<" split:"<<stepTimer.elapsed()<<std::endl;
588            stepTimer.restart();
589            #endif
590            // calculate new AABB etc
591            update_leftright_and_aabb( *points_x_, *points_y_, *points_z_, *index_x_, *index_y_, *index_z_, *owners_x_, *splits_,*aabb_min_, *aabb_max_);
592            #ifdef PRINT_DEBUG_TIMING
593            cudaDeviceSynchronize();
594            std::cout<<" update_leftright_and_aabb:"<<stepTimer.elapsed()<<std::endl;
595            stepTimer.restart();
596            print_vector(node_count_);
597            #endif
598
599        }
600    }
601   
602  template<class Distance>
603  friend class KDTreeCuda3dIndex;
604
605protected:
606
607
608    //! takes the partitioned nodes, and sets the left-/right info of leaf nodes, as well as the AABBs
609    void
610    update_leftright_and_aabb( const thrust::device_vector<float>& x, const thrust::device_vector<float>& y,const thrust::device_vector<float>& z,
611                               const thrust::device_vector<int>& ix, const thrust::device_vector<int>& iy,const thrust::device_vector<int>& iz,
612                               const thrust::device_vector<int>& owners,
613                               thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>& splits, thrust::device_vector<float4>& aabbMin,thrust::device_vector<float4>& aabbMax)
614    {
615        thrust::device_vector<int>* labelsUnique=tmp_owners_;
616        thrust::device_vector<int>* countsUnique=tmp_index_;
617    // assume: points of each node are continuous in the array
618   
619    // find which nodes are here, and where each node's points begin and end
620        int unique_labels = thrust::unique_by_key_copy( owners.begin(), owners.end(), thrust::counting_iterator<int>(0), labelsUnique->begin(), countsUnique->begin()).first - labelsUnique->begin();
621
622    // update the info
623        cuda::kd_tree_builder_detail::SetLeftAndRightAndAABB s;
624        s.maxPoints=x.size();
625        s.nElements=unique_labels;
626        s.nodes=thrust::raw_pointer_cast(&(splits[0]));
627        s.counts=thrust::raw_pointer_cast(&( (*countsUnique)[0]));
628        s.labels=thrust::raw_pointer_cast(&( (*labelsUnique)[0]));
629        s.x=thrust::raw_pointer_cast(&x[0]);
630        s.y=thrust::raw_pointer_cast(&y[0]);
631        s.z=thrust::raw_pointer_cast(&z[0]);
632        s.ix=thrust::raw_pointer_cast(&ix[0]);
633        s.iy=thrust::raw_pointer_cast(&iy[0]);
634        s.iz=thrust::raw_pointer_cast(&iz[0]);
635        s.aabbMin=thrust::raw_pointer_cast(&aabbMin[0]);
636        s.aabbMax=thrust::raw_pointer_cast(&aabbMax[0]);
637
638        thrust::counting_iterator<int> it(0);
639        thrust::for_each(it, it+unique_labels, s);
640    }
641
642    //! Separates the left and right children of each node into continuous parts of the array.
643    //! More specifically, it seperates children with even and odd node indices because nodes are always
644    //! allocated in pairs -> child1==child2+1 -> child1 even and child2 odd, or vice-versa.
645    //! Since the split operation is stable, this results in continuous partitions
646    //! for all the single nodes.
647    //! (basically the split primitive according to sengupta et al)
648    //! about twice as fast as thrust::partition
649    void separate_left_and_right_children( thrust::device_vector<int>& key_in, thrust::device_vector<int>& val_in, thrust::device_vector<int>& key_out, thrust::device_vector<int>& val_out, thrust::device_vector<int>& left_right_marks, bool scatter_val_out=true )
650    {
651        thrust::device_vector<int>* f_tmp = &val_out;
652        thrust::device_vector<int>* addr_tmp = tmp_misc_;
653
654        thrust::exclusive_scan( /*thrust::make_transform_iterator(*/ left_right_marks.begin() /*,cuda::kd_tree_builder_detail::IsEven*/
655                                                                     /*())*/, /*thrust::make_transform_iterator(*/ left_right_marks.end() /*,cuda::kd_tree_builder_detail::IsEven*/
656                                                                     /*())*/,     f_tmp->begin() );
657        cuda::kd_tree_builder_detail::set_addr3 sa;
658        sa.val_=thrust::raw_pointer_cast(&left_right_marks[0]);
659        sa.f_=thrust::raw_pointer_cast(&(*f_tmp)[0]);
660        sa.npoints_=key_in.size();
661        thrust::counting_iterator<int> it(0);
662        thrust::transform(it, it+val_in.size(), addr_tmp->begin(), sa);
663
664        thrust::scatter(key_in.begin(), key_in.end(), addr_tmp->begin(), key_out.begin());
665        if( scatter_val_out ) thrust::scatter(val_in.begin(), val_in.end(), addr_tmp->begin(), val_out.begin());
666    }
667
668    //! allocates additional space in all the node-related vectors.
669    //! new_size elements will be added to all vectors.
670    void resize_node_vectors( size_t new_size )
671    {
672        size_t add = new_size - child1_->size();
673        child1_->insert(child1_->end(), add, -1);
674        parent_->insert(parent_->end(), add, -1);
675        cuda::kd_tree_builder_detail::SplitInfo s;
676        s.left=0;
677        s.right=0;
678        splits_->insert(splits_->end(), add, s);
679        float4 f;
680        aabb_min_->insert(aabb_min_->end(), add, f);
681        aabb_max_->insert(aabb_max_->end(), add, f);
682    }
683
684
685    const thrust::device_vector<float4>* points_;
686 
687  // tree data, those are stored per-node
688 
689  //! left child of each node. (right child==left child + 1, due to the alloc mechanism)
690  //! child1_[node]==-1 if node is a leaf node
691    thrust::device_vector<int>* child1_;
692  //! parent node of each node
693    thrust::device_vector<int>* parent_;
694  //! split info (dim/value or left/right pointers)
695    thrust::device_vector<cuda::kd_tree_builder_detail::SplitInfo>* splits_;
696  //! min aabb value of each node
697    thrust::device_vector<float4>* aabb_min_;
698  //! max aabb value of each node
699    thrust::device_vector<float4>* aabb_max_;
700
701    enum AllocationInfo
702    {
703        NodeCount=0,
704        NodesAllocated=1,
705        OutOfSpace=2
706    };
707    // those were put into a single vector of 3 elements so that only one mem transfer will be needed for all three of them
708    //  thrust::device_vector<int> out_of_space_;
709    //  thrust::device_vector<int> node_count_;
710    //  thrust::device_vector<int> nodes_allocated_;
711    thrust::device_vector<int> allocation_info_;
712 
713    int max_leaf_size_;
714
715  // coordinate values of the points
716    thrust::device_vector<float>* points_x_, * points_y_, * points_z_;
717  // indices
718    thrust::device_vector<int>* index_x_,  * index_y_,  * index_z_;
719  // owner node
720    thrust::device_vector<int>* owners_x_, * owners_y_, * owners_z_;
721  // contains info about whether a point was partitioned to the left or right child after a split
722    thrust::device_vector<int>* leftright_x_, * leftright_y_, * leftright_z_;
723    thrust::device_vector<int>* tmp_index_, * tmp_owners_, * tmp_misc_;
724    bool delete_node_info_;
725};
726
727
728} // namespace flann
729#endif
Note: See TracBrowser for help on using the repository browser.