
source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/algorithms/lsh_index.h @ 15840

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago

#2886 added utility console program for clustering of expressions

File size: 16.9 KB
/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

/***********************************************************************
 * Author: Vincent Rabaud
 *************************************************************************/

#ifndef FLANN_LSH_INDEX_H_
#define FLANN_LSH_INDEX_H_

#include <algorithm>
#include <cassert>
#include <cstring>
#include <map>
#include <vector>

#include "flann/general.h"
#include "flann/algorithms/nn_index.h"
#include "flann/util/matrix.h"
#include "flann/util/result_set.h"
#include "flann/util/heap.h"
#include "flann/util/lsh_table.h"
#include "flann/util/allocator.h"
#include "flann/util/random.h"
#include "flann/util/saving.h"

namespace flann
{

struct LshIndexParams : public IndexParams
{
    LshIndexParams(unsigned int table_number = 12, unsigned int key_size = 20, unsigned int multi_probe_level = 2)
    {
        (*this)["algorithm"] = FLANN_INDEX_LSH;
        // The number of hash tables to use
        (*this)["table_number"] = table_number;
        // The length of the key in the hash tables
        (*this)["key_size"] = key_size;
        // Number of levels to use in multi-probe (0 for standard LSH)
        (*this)["multi_probe_level"] = multi_probe_level;
    }
};

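// Example: LshIndexParams(16, 24, 1) describes an index built from 16 hash
// tables with 24-bit bucket keys that probes, besides the query's own bucket,
// every bucket whose key differs in at most one bit. The values are stored in
// the IndexParams map and read back in the LshIndex constructor below via
// get_param<unsigned int>().
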
/**
 * Locality-sensitive hashing (LSH) index
 *
 * Contains the LSH hash tables and other information for indexing a set of
 * points for nearest-neighbor matching.
 */
template<typename Distance>
class LshIndex : public NNIndex<Distance>
{
public:
    typedef typename Distance::ElementType ElementType;
    typedef typename Distance::ResultType DistanceType;

    /** Constructor
     * @param input_data dataset with the input features
     * @param params parameters passed to the LSH algorithm
     * @param d the distance used
     */
    LshIndex(const Matrix<ElementType>& input_data, const IndexParams& params = LshIndexParams(),
             Distance d = Distance()) :
        dataset_(input_data), index_params_(params), distance_(d)
    {
        table_number_ = get_param<unsigned int>(index_params_,"table_number",12);
        key_size_ = get_param<unsigned int>(index_params_,"key_size",20);
        multi_probe_level_ = get_param<unsigned int>(index_params_,"multi_probe_level",2);

        feature_size_ = dataset_.cols;
        fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
    }
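    // Note: the constructor only stores the dataset, reads the parameters and
    // precomputes the multi-probe XOR masks; the hash tables themselves are
    // created when buildIndex() is called.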


    LshIndex(const LshIndex&);
    LshIndex& operator=(const LshIndex&);

    /**
     * Builds the index
     */
    void buildIndex()
    {
        tables_.resize(table_number_);
        for (unsigned int i = 0; i < table_number_; ++i) {
            lsh::LshTable<ElementType>& table = tables_[i];
            table = lsh::LshTable<ElementType>(feature_size_, key_size_);

            // Add the features to the table
            table.add(dataset_);
        }
    }

    flann_algorithm_t getType() const
    {
        return FLANN_INDEX_LSH;
    }


    void saveIndex(FILE* stream)
    {
        save_value(stream,table_number_);
        save_value(stream,key_size_);
        save_value(stream,multi_probe_level_);
        save_value(stream, dataset_);
    }

    void loadIndex(FILE* stream)
    {
        load_value(stream, table_number_);
        load_value(stream, key_size_);
        load_value(stream, multi_probe_level_);
        load_value(stream, dataset_);
        // Building the index is so fast that we can afford not to store it
        buildIndex();

        index_params_["algorithm"] = getType();
        index_params_["table_number"] = table_number_;
        index_params_["key_size"] = key_size_;
        index_params_["multi_probe_level"] = multi_probe_level_;
    }

    /**
     *  Returns size of index.
     */
    size_t size() const
    {
        return dataset_.rows;
    }

    /**
     * Returns the length of an index feature.
     */
    size_t veclen() const
    {
        return feature_size_;
    }

    /**
     * Computes the index memory usage
     * Returns: memory used by the index
     */
    int usedMemory() const
    {
        return dataset_.rows * sizeof(int);
    }


    IndexParams getParameters() const
    {
        return index_params_;
    }

    /**
     * \brief Perform k-nearest neighbor search
     * \param[in] queries The query points for which to find the nearest neighbors
     * \param[out] indices The indices of the nearest neighbors found
     * \param[out] dists Distances to the nearest neighbors found
     * \param[in] knn Number of nearest neighbors to return
     * \param[in] params Search parameters
     */
    virtual int knnSearch(const Matrix<ElementType>& queries,
                          Matrix<int>& indices,
                          Matrix<DistanceType>& dists,
                          size_t knn,
                          const SearchParams& params)
    {
        assert(queries.cols == veclen());
        assert(indices.rows >= queries.rows);
        assert(dists.rows >= queries.rows);
        assert(indices.cols >= knn);
        assert(dists.cols >= knn);

        int count = 0;
        if (params.use_heap == FLANN_True) {
            KNNUniqueResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                resultSet.copy(indices[i], dists[i], knn, params.sorted);
                count += resultSet.size();
            }
        }
        else {
            KNNResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                resultSet.copy(indices[i], dists[i], knn, params.sorted);
                count += resultSet.size();
            }
        }

        return count;
    }
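    // The return value is the total number of neighbors found over all
    // queries (at most knn per query).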

    /**
     * \brief Perform k-nearest neighbor search
     * \param[in] queries The query points for which to find the nearest neighbors
     * \param[out] indices The indices of the nearest neighbors found
     * \param[out] dists Distances to the nearest neighbors found
     * \param[in] knn Number of nearest neighbors to return
     * \param[in] params Search parameters
     */
    virtual int knnSearch(const Matrix<ElementType>& queries,
                          std::vector< std::vector<int> >& indices,
                          std::vector<std::vector<DistanceType> >& dists,
                          size_t knn,
                          const SearchParams& params)
    {
        assert(queries.cols == veclen());
        if (indices.size() < queries.rows) indices.resize(queries.rows);
        if (dists.size() < queries.rows) dists.resize(queries.rows);

        int count = 0;
        if (params.use_heap == FLANN_True) {
            KNNUniqueResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                size_t n = std::min(resultSet.size(), knn);
                indices[i].resize(n);
                dists[i].resize(n);
                resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
                count += n;
            }
        }
        else {
            KNNResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                size_t n = std::min(resultSet.size(), knn);
                indices[i].resize(n);
                dists[i].resize(n);
                resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
                count += n;
            }
        }

        return count;
    }

    /**
     * Find set of nearest neighbors to vec. Their indices are stored inside
     * the result object.
     *
     * Params:
     *     result = the result object in which the indices of the nearest-neighbors are stored
     *     vec = the vector for which to search the nearest neighbors
     *     searchParams = the search parameters (currently unused by this index)
     */
    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& /*searchParams*/)
    {
        getNeighbors(vec, result);
    }

private:
    /** Defines the comparator on score and index
     */
    typedef std::pair<float, unsigned int> ScoreIndexPair;
    struct SortScoreIndexPairOnSecond
    {
        bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const
        {
            return left.second < right.second;
        }
    };

    /** Fills the different xor masks to use when getting the neighbors in multi-probe LSH
     * @param key the key we build neighbors from
     * @param lowest_index the lowest index of the bit set
     * @param level the multi-probe level we are at
     * @param xor_masks all the generated xor masks
     */
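    // For example, fill_xor_mask(0, 3, 1, masks) produces the masks
    // {000, 100, 010, 001}: the unmodified key plus every single-bit flip.
    // With level 2 the two-bit combinations 110, 101 and 011 are added as well.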
    void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level,
                       std::vector<lsh::BucketKey>& xor_masks)
    {
        xor_masks.push_back(key);
        if (level == 0) return;
        for (int index = lowest_index - 1; index >= 0; --index) {
            // Create a new key
            lsh::BucketKey new_key = key | (1 << index);
            fill_xor_mask(new_key, index, level - 1, xor_masks);
        }
    }

    /** Performs the approximate nearest-neighbor search.
     * @param vec the feature to analyze
     * @param do_radius flag indicating if we check the radius too
     * @param radius the radius if it is a radius search
     * @param do_k flag indicating if we limit the number of nn
     * @param k_nn the number of nearest neighbors
     * @param checked_average used for debugging
     */
    void getNeighbors(const ElementType* vec, bool do_radius, float radius, bool do_k, unsigned int k_nn,
                      float& checked_average)
    {
        static std::vector<ScoreIndexPair> score_index_heap;

        if (do_k) {
            unsigned int worst_score = std::numeric_limits<unsigned int>::max();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
            for (; table != table_end; ++table) {
                size_t key = table->getKey(vec);
                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
                for (; xor_mask != xor_mask_end; ++xor_mask) {
                    size_t sub_key = key ^ (*xor_mask);
                    const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                    if (bucket == 0) continue;

                    // Go over each descriptor index
                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                    DistanceType hamming_distance;

                    // Process the rest of the candidates
                    for (; training_index < last_training_index; ++training_index) {
                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);

                        if (hamming_distance < worst_score) {
                            // Insert the new element
                            score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
                            std::push_heap(score_index_heap.begin(), score_index_heap.end());

                            if (score_index_heap.size() > (unsigned int)k_nn) {
                                // Remove the highest distance value as we have too many elements
                                std::pop_heap(score_index_heap.begin(), score_index_heap.end());
                                score_index_heap.pop_back();
                                // Keep track of the worst score
                                worst_score = score_index_heap.front().first;
                            }
                        }
                    }
                }
            }
        }
        else {
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
            for (; table != table_end; ++table) {
                size_t key = table->getKey(vec);
                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
                for (; xor_mask != xor_mask_end; ++xor_mask) {
                    size_t sub_key = key ^ (*xor_mask);
                    const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                    if (bucket == 0) continue;

                    // Go over each descriptor index
                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                    DistanceType hamming_distance;

                    // Process the rest of the candidates
                    for (; training_index < last_training_index; ++training_index) {
                        // Compute the Hamming distance
                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
                        if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
                    }
                }
            }
        }
    }

    /** Performs the approximate nearest-neighbor search.
     * This is a slower version than the above as it uses the ResultSet
     * @param vec the feature to analyze
     */
    void getNeighbors(const ElementType* vec, ResultSet<DistanceType>& result)
    {
        typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
        typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
        for (; table != table_end; ++table) {
            size_t key = table->getKey(vec);
            std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
            std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
            for (; xor_mask != xor_mask_end; ++xor_mask) {
                size_t sub_key = key ^ (*xor_mask);
                const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                if (bucket == 0) continue;

                // Go over each descriptor index
                std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                DistanceType hamming_distance;

                // Process the rest of the candidates
                for (; training_index < last_training_index; ++training_index) {
                    // Compute the Hamming distance
                    hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
                    result.addPoint(hamming_distance, *training_index);
                }
            }
        }
    }

    /** The different hash tables */
    std::vector<lsh::LshTable<ElementType> > tables_;

    /** The data the LSH tables were built from */
    Matrix<ElementType> dataset_;

    /** The size of the features (as ElementType[]) */
    unsigned int feature_size_;

    IndexParams index_params_;

    /** table number */
    unsigned int table_number_;
    /** key size */
    unsigned int key_size_;
    /** How far should we look for neighbors in multi-probe LSH */
    unsigned int multi_probe_level_;

    /** The XOR masks to apply to a key to get the neighboring buckets */
    std::vector<lsh::BucketKey> xor_masks_;

    Distance distance_;
};
}

#endif //FLANN_LSH_INDEX_H_
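
The snippet below is a minimal usage sketch for this index: it builds an LSH index over binary descriptors and runs a k-nearest-neighbor query against it. It assumes the Hamming distance functor declared in flann/algorithms/dist.h and that the flann sources in this branch are on the include path; the descriptor data, sizes and variable names are placeholders, not part of the header above.

#include "flann/algorithms/dist.h"       // flann::Hamming (assumed location)
#include "flann/algorithms/lsh_index.h"  // flann::LshIndex, the header above

#include <cstddef>
#include <vector>

int main()
{
    typedef flann::Hamming<unsigned char> Distance;
    typedef Distance::ResultType DistanceType;

    const size_t n_points = 1000;  // number of database descriptors (placeholder)
    const size_t n_bytes  = 32;    // 256-bit binary descriptors
    const size_t knn      = 4;

    // Placeholder descriptors; in practice these would be e.g. BRIEF/ORB features.
    std::vector<unsigned char> db(n_points * n_bytes, 0);
    std::vector<unsigned char> q(n_bytes, 0);
    flann::Matrix<unsigned char> dataset(&db[0], n_points, n_bytes);
    flann::Matrix<unsigned char> query(&q[0], 1, n_bytes);

    // 12 tables, 20-bit bucket keys, multi-probe within 2 bits (the defaults above).
    flann::LshIndex<Distance> index(dataset, flann::LshIndexParams(12, 20, 2));
    index.buildIndex();

    // Output buffers: one row per query, knn columns.
    std::vector<int> neighbor_indices(knn);
    std::vector<DistanceType> neighbor_dists(knn);
    flann::Matrix<int> indices(&neighbor_indices[0], 1, knn);
    flann::Matrix<DistanceType> dists(&neighbor_dists[0], 1, knn);

    index.knnSearch(query, indices, dists, knn, flann::SearchParams());
    return 0;
}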