1 | /********************************************************************** |
---|
2 | * Software License Agreement (BSD License) |
---|
3 | * |
---|
4 | * Copyright 2011 Andreas Muetzel (amuetzel@uni-koblenz.de). All rights reserved. |
---|
5 | * |
---|
6 | * THE BSD LICENSE |
---|
7 | * |
---|
8 | * Redistribution and use in source and binary forms, with or without |
---|
9 | * modification, are permitted provided that the following conditions |
---|
10 | * are met: |
---|
11 | * |
---|
12 | * 1. Redistributions of source code must retain the above copyright |
---|
13 | * notice, this list of conditions and the following disclaimer. |
---|
14 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
15 | * notice, this list of conditions and the following disclaimer in the |
---|
16 | * documentation and/or other materials provided with the distribution. |
---|
17 | * |
---|
18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
---|
19 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
---|
20 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
21 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
---|
22 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
---|
23 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
---|
24 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
---|
25 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
---|
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
---|
27 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
28 | *************************************************************************/ |
---|
29 | #ifndef FLANN_UTIL_CUDA_RESULTSET_H |
---|
30 | #define FLANN_UTIL_CUDA_RESULTSET_H |
---|
31 | |
---|
32 | #include <flann/util/cuda/heap.h> |
---|
33 | #include <limits> |
---|
34 | |
---|
35 | __device__ __forceinline__ |
---|
36 | float infinity() |
---|
37 | { |
---|
38 | return __int_as_float(0x7f800000); |
---|
39 | } |
---|
40 | |
---|
41 | #ifndef INFINITY |
---|
42 | #define INFINITY infinity() |
---|
43 | #endif |
---|
44 | |
---|
45 | namespace flann |
---|
46 | { |
---|
47 | namespace cuda |
---|
48 | { |
---|
49 | //! result set for the 1nn search. Doesn't do any global memory accesses on its own, |
---|
50 | template< typename DistanceType > |
---|
51 | struct SingleResultSet |
---|
52 | { |
---|
53 | int bestIndex; |
---|
54 | DistanceType bestDist; |
---|
55 | const DistanceType epsError; |
---|
56 | |
---|
57 | __device__ |
---|
58 | SingleResultSet( DistanceType eps ) : bestIndex(-1),bestDist(INFINITY), epsError(eps){ } |
---|
59 | |
---|
60 | __device__ |
---|
61 | inline float |
---|
62 | worstDist() |
---|
63 | { |
---|
64 | return bestDist; |
---|
65 | } |
---|
66 | |
---|
67 | __device__ |
---|
68 | inline void |
---|
69 | insert(int index, DistanceType dist) |
---|
70 | { |
---|
71 | if( dist <= bestDist ) { |
---|
72 | bestIndex=index; |
---|
73 | bestDist=dist; |
---|
74 | } |
---|
75 | } |
---|
76 | |
---|
77 | DistanceType* resultDist; |
---|
78 | int* resultIndex; |
---|
79 | |
---|
80 | __device__ |
---|
81 | inline void |
---|
82 | setResultLocation( DistanceType* dists, int* index, int thread, int stride ) |
---|
83 | { |
---|
84 | resultDist=dists+thread*stride; |
---|
85 | resultIndex=index+thread*stride; |
---|
86 | if( stride != 1 ) { |
---|
87 | for( int i=1; i<stride; i++ ) { |
---|
88 | resultDist[i]=INFINITY; |
---|
89 | resultIndex[i]=-1; |
---|
90 | } |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | __device__ |
---|
95 | inline void |
---|
96 | finish() |
---|
97 | { |
---|
98 | resultDist[0]=bestDist; |
---|
99 | resultIndex[0]=bestIndex; |
---|
100 | } |
---|
101 | }; |
---|
102 | |
---|
103 | template< typename DistanceType > |
---|
104 | struct GreaterThan |
---|
105 | { |
---|
106 | __device__ |
---|
107 | bool operator()(DistanceType a, DistanceType b) |
---|
108 | { |
---|
109 | return a>b; |
---|
110 | } |
---|
111 | }; |
---|
112 | |
---|
113 | |
---|
114 | // using this and the template uses 2 or 3 registers more than the direct implementation in the kNearestKernel, but |
---|
115 | // there is no speed difference. |
---|
116 | // Setting useHeap as a template parameter leads to a whole lot of things being |
---|
117 | // optimized away by nvcc. |
---|
118 | // Register counts are the same as when removing not-needed variables in explicit specializations |
---|
119 | // and the "if( useHeap )" branches are eliminated at compile time. |
---|
120 | // The downside of this: a bit more complex kernel launch code. |
---|
121 | template< typename DistanceType, bool useHeap > |
---|
122 | struct KnnResultSet |
---|
123 | { |
---|
124 | int foundNeighbors; |
---|
125 | DistanceType largestHeapDist; |
---|
126 | int maxDistIndex; |
---|
127 | const int k; |
---|
128 | const bool sorted; |
---|
129 | const DistanceType epsError; |
---|
130 | |
---|
131 | |
---|
132 | __device__ |
---|
133 | KnnResultSet(int knn, bool sortResults, DistanceType eps) : foundNeighbors(0),largestHeapDist(INFINITY),k(knn), sorted(sortResults), epsError(eps){ } |
---|
134 | |
---|
135 | // __host__ __device__ |
---|
136 | // KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ } |
---|
137 | |
---|
138 | __device__ |
---|
139 | inline DistanceType |
---|
140 | worstDist() |
---|
141 | { |
---|
142 | return largestHeapDist; |
---|
143 | } |
---|
144 | |
---|
145 | __device__ |
---|
146 | inline void |
---|
147 | insert(int index, DistanceType dist) |
---|
148 | { |
---|
149 | if( foundNeighbors<k ) { |
---|
150 | resultDist[foundNeighbors]=dist; |
---|
151 | resultIndex[foundNeighbors]=index; |
---|
152 | if( foundNeighbors==k-1) { |
---|
153 | if( useHeap ) { |
---|
154 | flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
155 | largestHeapDist=resultDist[0]; |
---|
156 | } |
---|
157 | else { |
---|
158 | findLargestDistIndex(); |
---|
159 | } |
---|
160 | |
---|
161 | } |
---|
162 | foundNeighbors++; |
---|
163 | } |
---|
164 | else if( dist < largestHeapDist ) { |
---|
165 | if( useHeap ) { |
---|
166 | resultDist[0]=dist; |
---|
167 | resultIndex[0]=index; |
---|
168 | flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>()); |
---|
169 | largestHeapDist=resultDist[0]; |
---|
170 | } |
---|
171 | else { |
---|
172 | resultDist[maxDistIndex]=dist; |
---|
173 | resultIndex[maxDistIndex]=index; |
---|
174 | findLargestDistIndex(); |
---|
175 | } |
---|
176 | |
---|
177 | } |
---|
178 | } |
---|
179 | |
---|
180 | __device__ |
---|
181 | void |
---|
182 | findLargestDistIndex( ) |
---|
183 | { |
---|
184 | largestHeapDist=resultDist[0]; |
---|
185 | maxDistIndex=0; |
---|
186 | for( int i=1; i<k; i++ ) |
---|
187 | if( resultDist[i] > largestHeapDist ) { |
---|
188 | maxDistIndex=i; |
---|
189 | largestHeapDist=resultDist[i]; |
---|
190 | } |
---|
191 | } |
---|
192 | |
---|
193 | float* resultDist; |
---|
194 | int* resultIndex; |
---|
195 | |
---|
196 | __device__ |
---|
197 | inline void |
---|
198 | setResultLocation( DistanceType* dists, int* index, int thread, int stride ) |
---|
199 | { |
---|
200 | resultDist=dists+stride*thread; |
---|
201 | resultIndex=index+stride*thread; |
---|
202 | for( int i=0; i<stride; i++ ) { |
---|
203 | resultDist[i]=INFINITY; |
---|
204 | resultIndex[i]=-1; |
---|
205 | // resultIndex[tid+i*blockDim.x]=-1; |
---|
206 | // resultDist[tid+i*blockDim.x]=INFINITY; |
---|
207 | } |
---|
208 | } |
---|
209 | |
---|
210 | __host__ __device__ |
---|
211 | inline void |
---|
212 | finish() |
---|
213 | { |
---|
214 | if( sorted ) { |
---|
215 | if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
216 | for( int i=k-1; i>0; i-- ) { |
---|
217 | flann::cuda::swap( resultDist[0], resultDist[i] ); |
---|
218 | flann::cuda::swap( resultIndex[0], resultIndex[i] ); |
---|
219 | flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() ); |
---|
220 | } |
---|
221 | } |
---|
222 | } |
---|
223 | }; |
---|
224 | |
---|
225 | template <typename DistanceType> |
---|
226 | struct CountingRadiusResultSet |
---|
227 | { |
---|
228 | int count_; |
---|
229 | DistanceType radius_sq_; |
---|
230 | int max_neighbors_; |
---|
231 | |
---|
232 | __device__ |
---|
233 | CountingRadiusResultSet(DistanceType radius, int max_neighbors) : count_(0),radius_sq_(radius), max_neighbors_(max_neighbors){ } |
---|
234 | |
---|
235 | __device__ |
---|
236 | inline DistanceType |
---|
237 | worstDist() |
---|
238 | { |
---|
239 | return radius_sq_; |
---|
240 | } |
---|
241 | |
---|
242 | __device__ |
---|
243 | inline void |
---|
244 | insert(int index, float dist) |
---|
245 | { |
---|
246 | if( dist < radius_sq_ ) { |
---|
247 | count_++; |
---|
248 | } |
---|
249 | } |
---|
250 | |
---|
251 | int* resultIndex; |
---|
252 | |
---|
253 | __device__ |
---|
254 | inline void |
---|
255 | setResultLocation( DistanceType* /*dists*/, int* count, int thread, int stride ) |
---|
256 | { |
---|
257 | resultIndex=count+thread*stride; |
---|
258 | } |
---|
259 | |
---|
260 | __device__ |
---|
261 | inline void |
---|
262 | finish() |
---|
263 | { |
---|
264 | if(( max_neighbors_<=0) ||( count_<=max_neighbors_) ) resultIndex[0]=count_; |
---|
265 | else resultIndex[0]=max_neighbors_; |
---|
266 | } |
---|
267 | }; |
---|
268 | |
---|
269 | template<typename DistanceType, bool useHeap> |
---|
270 | struct RadiusKnnResultSet |
---|
271 | { |
---|
272 | int foundNeighbors; |
---|
273 | DistanceType largestHeapDist; |
---|
274 | int maxDistElem; |
---|
275 | const int k; |
---|
276 | const bool sorted; |
---|
277 | const DistanceType radius_sq_; |
---|
278 | int* segment_starts_; |
---|
279 | // int count_; |
---|
280 | |
---|
281 | |
---|
282 | __device__ |
---|
283 | RadiusKnnResultSet(DistanceType radius, int knn, int* segment_starts, bool sortResults) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), radius_sq_(radius),segment_starts_(segment_starts) { } |
---|
284 | |
---|
285 | // __host__ __device__ |
---|
286 | // KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ } |
---|
287 | |
---|
288 | __device__ |
---|
289 | inline DistanceType |
---|
290 | worstDist() |
---|
291 | { |
---|
292 | return largestHeapDist; |
---|
293 | } |
---|
294 | |
---|
295 | __device__ |
---|
296 | inline void |
---|
297 | insert(int index, DistanceType dist) |
---|
298 | { |
---|
299 | if( dist < radius_sq_ ) { |
---|
300 | if( foundNeighbors<k ) { |
---|
301 | resultDist[foundNeighbors]=dist; |
---|
302 | resultIndex[foundNeighbors]=index; |
---|
303 | if(( foundNeighbors==k-1) && useHeap) { |
---|
304 | if( useHeap ) { |
---|
305 | flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
306 | largestHeapDist=resultDist[0]; |
---|
307 | } |
---|
308 | else { |
---|
309 | findLargestDistIndex(); |
---|
310 | } |
---|
311 | } |
---|
312 | foundNeighbors++; |
---|
313 | |
---|
314 | } |
---|
315 | else if( dist < largestHeapDist ) { |
---|
316 | if( useHeap ) { |
---|
317 | resultDist[0]=dist; |
---|
318 | resultIndex[0]=index; |
---|
319 | flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>()); |
---|
320 | largestHeapDist=resultDist[0]; |
---|
321 | } |
---|
322 | else { |
---|
323 | resultDist[maxDistElem]=dist; |
---|
324 | resultIndex[maxDistElem]=index; |
---|
325 | findLargestDistIndex(); |
---|
326 | } |
---|
327 | } |
---|
328 | } |
---|
329 | } |
---|
330 | |
---|
331 | __device__ |
---|
332 | void |
---|
333 | findLargestDistIndex( ) |
---|
334 | { |
---|
335 | largestHeapDist=resultDist[0]; |
---|
336 | maxDistElem=0; |
---|
337 | for( int i=1; i<k; i++ ) |
---|
338 | if( resultDist[i] > largestHeapDist ) { |
---|
339 | maxDistElem=i; |
---|
340 | largestHeapDist=resultDist[i]; |
---|
341 | } |
---|
342 | } |
---|
343 | |
---|
344 | |
---|
345 | DistanceType* resultDist; |
---|
346 | int* resultIndex; |
---|
347 | |
---|
348 | __device__ |
---|
349 | inline void |
---|
350 | setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ ) |
---|
351 | { |
---|
352 | resultDist=dists+segment_starts_[thread]; |
---|
353 | resultIndex=index+segment_starts_[thread]; |
---|
354 | } |
---|
355 | |
---|
356 | __device__ |
---|
357 | inline void |
---|
358 | finish() |
---|
359 | { |
---|
360 | if( sorted ) { |
---|
361 | if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
362 | for( int i=foundNeighbors-1; i>0; i-- ) { |
---|
363 | flann::cuda::swap( resultDist[0], resultDist[i] ); |
---|
364 | flann::cuda::swap( resultIndex[0], resultIndex[i] ); |
---|
365 | flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() ); |
---|
366 | } |
---|
367 | } |
---|
368 | } |
---|
369 | }; |
---|
370 | |
---|
371 | // Difference to RadiusKnnResultSet: Works like KnnResultSet, doesn't pack the results densely (as the RadiusResultSet does) |
---|
372 | template <typename DistanceType, bool useHeap> |
---|
373 | struct KnnRadiusResultSet |
---|
374 | { |
---|
375 | int foundNeighbors; |
---|
376 | DistanceType largestHeapDist; |
---|
377 | int maxDistIndex; |
---|
378 | const int k; |
---|
379 | const bool sorted; |
---|
380 | const DistanceType epsError; |
---|
381 | const DistanceType radius_sq; |
---|
382 | |
---|
383 | |
---|
384 | __device__ |
---|
385 | KnnRadiusResultSet(int knn, bool sortResults, DistanceType eps, DistanceType radius) : foundNeighbors(0),largestHeapDist(radius),k(knn), sorted(sortResults), epsError(eps),radius_sq(radius){ } |
---|
386 | |
---|
387 | // __host__ __device__ |
---|
388 | // KnnResultSet(const KnnResultSet& o):foundNeighbors(o.foundNeighbors),largestHeapDist(o.largestHeapDist),k(o.k){ } |
---|
389 | |
---|
390 | __device__ |
---|
391 | inline DistanceType |
---|
392 | worstDist() |
---|
393 | { |
---|
394 | return largestHeapDist; |
---|
395 | } |
---|
396 | |
---|
397 | __device__ |
---|
398 | inline void |
---|
399 | insert(int index, DistanceType dist) |
---|
400 | { |
---|
401 | if( dist < largestHeapDist ) { |
---|
402 | if( foundNeighbors<k ) { |
---|
403 | resultDist[foundNeighbors]=dist; |
---|
404 | resultIndex[foundNeighbors]=index; |
---|
405 | if( foundNeighbors==k-1 ) { |
---|
406 | if( useHeap ) { |
---|
407 | flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
408 | largestHeapDist=resultDist[0]; |
---|
409 | } |
---|
410 | else { |
---|
411 | findLargestDistIndex(); |
---|
412 | } |
---|
413 | } |
---|
414 | foundNeighbors++; |
---|
415 | } |
---|
416 | else { //if( dist < largestHeapDist ) |
---|
417 | if( useHeap ) { |
---|
418 | resultDist[0]=dist; |
---|
419 | resultIndex[0]=index; |
---|
420 | flann::cuda::heap::sift_down(resultDist,resultIndex,0,k,GreaterThan<DistanceType>()); |
---|
421 | largestHeapDist=resultDist[0]; |
---|
422 | } |
---|
423 | else { |
---|
424 | resultDist[maxDistIndex]=dist; |
---|
425 | resultIndex[maxDistIndex]=index; |
---|
426 | findLargestDistIndex(); |
---|
427 | } |
---|
428 | } |
---|
429 | } |
---|
430 | } |
---|
431 | __device__ |
---|
432 | void |
---|
433 | findLargestDistIndex( ) |
---|
434 | { |
---|
435 | largestHeapDist=resultDist[0]; |
---|
436 | maxDistIndex=0; |
---|
437 | for( int i=1; i<k; i++ ) |
---|
438 | if( resultDist[i] > largestHeapDist ) { |
---|
439 | maxDistIndex=i; |
---|
440 | largestHeapDist=resultDist[i]; |
---|
441 | } |
---|
442 | } |
---|
443 | |
---|
444 | DistanceType* resultDist; |
---|
445 | int* resultIndex; |
---|
446 | |
---|
447 | __device__ |
---|
448 | inline void |
---|
449 | setResultLocation( DistanceType* dists, int* index, int thread, int stride ) |
---|
450 | { |
---|
451 | resultDist=dists+stride*thread; |
---|
452 | resultIndex=index+stride*thread; |
---|
453 | for( int i=0; i<stride; i++ ) { |
---|
454 | resultDist[i]=INFINITY; |
---|
455 | resultIndex[i]=-1; |
---|
456 | // resultIndex[tid+i*blockDim.x]=-1; |
---|
457 | // resultDist[tid+i*blockDim.x]=INFINITY; |
---|
458 | } |
---|
459 | } |
---|
460 | |
---|
461 | __device__ |
---|
462 | inline void |
---|
463 | finish() |
---|
464 | { |
---|
465 | if( sorted ) { |
---|
466 | if( !useHeap ) flann::cuda::heap::make_heap(resultDist,resultIndex,k,GreaterThan<DistanceType>()); |
---|
467 | for( int i=k-1; i>0; i-- ) { |
---|
468 | flann::cuda::swap( resultDist[0], resultDist[i] ); |
---|
469 | flann::cuda::swap( resultIndex[0], resultIndex[i] ); |
---|
470 | flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() ); |
---|
471 | } |
---|
472 | } |
---|
473 | } |
---|
474 | }; |
---|
475 | |
---|
476 | //! fills the radius output buffer. |
---|
477 | //! IMPORTANT ASSERTION: ASSUMES THAT THERE IS ENOUGH SPACE FOR EVERY NEIGHBOR! IF THIS ISN'T |
---|
478 | //! TRUE, USE KnnRadiusResultSet! (Otherwise, the neighbors of one element might overflow into the next element, or past the buffer.) |
---|
479 | template< typename DistanceType > |
---|
480 | struct RadiusResultSet |
---|
481 | { |
---|
482 | DistanceType radius_sq_; |
---|
483 | int* segment_starts_; |
---|
484 | int count_; |
---|
485 | bool sorted_; |
---|
486 | |
---|
487 | __device__ |
---|
488 | RadiusResultSet(DistanceType radius, int* segment_starts, bool sorted) : radius_sq_(radius), segment_starts_(segment_starts), count_(0), sorted_(sorted){ } |
---|
489 | |
---|
490 | __device__ |
---|
491 | inline DistanceType |
---|
492 | worstDist() |
---|
493 | { |
---|
494 | return radius_sq_; |
---|
495 | } |
---|
496 | |
---|
497 | __device__ |
---|
498 | inline void |
---|
499 | insert(int index, DistanceType dist) |
---|
500 | { |
---|
501 | if( dist < radius_sq_ ) { |
---|
502 | resultIndex[count_]=index; |
---|
503 | resultDist[count_]=dist; |
---|
504 | count_++; |
---|
505 | } |
---|
506 | } |
---|
507 | |
---|
508 | int* resultIndex; |
---|
509 | DistanceType* resultDist; |
---|
510 | |
---|
511 | __device__ |
---|
512 | inline void |
---|
513 | setResultLocation( DistanceType* dists, int* index, int thread, int /*stride*/ ) |
---|
514 | { |
---|
515 | resultIndex=index+segment_starts_[thread]; |
---|
516 | resultDist=dists+segment_starts_[thread]; |
---|
517 | } |
---|
518 | |
---|
519 | __device__ |
---|
520 | inline void |
---|
521 | finish() |
---|
522 | { |
---|
523 | if( sorted_ ) { |
---|
524 | flann::cuda::heap::make_heap( resultDist,resultIndex, count_, GreaterThan<DistanceType>() ); |
---|
525 | for( int i=count_-1; i>0; i-- ) { |
---|
526 | flann::cuda::swap( resultDist[0], resultDist[i] ); |
---|
527 | flann::cuda::swap( resultIndex[0], resultIndex[i] ); |
---|
528 | flann::cuda::heap::sift_down( resultDist,resultIndex, 0, i, GreaterThan<DistanceType>() ); |
---|
529 | } |
---|
530 | } |
---|
531 | } |
---|
532 | }; |
---|
533 | } |
---|
534 | } |
---|
535 | |
---|
536 | #endif |
---|