Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/util/lsh_table.h @ 15840

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago

#2886 added utility console program for clustering of expressions

File size: 16.7 KB
Line 
1/***********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6 *
7 * THE BSD LICENSE
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
30
31/***********************************************************************
32 * Author: Vincent Rabaud
33 *************************************************************************/
34
35#ifndef FLANN_LSH_TABLE_H_
36#define FLANN_LSH_TABLE_H_
37
38#include <algorithm>
39#include <iostream>
40#include <iomanip>
41#include <limits.h>
42// TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43#if USE_UNORDERED_MAP
44#include <unordered_map>
45#else
46#include <map>
47#endif
48#include <math.h>
49#include <stddef.h>
50
51#include "flann/util/dynamic_bitset.h"
52#include "flann/util/matrix.h"
53
54namespace flann
55{
56
57namespace lsh
58{
59
60////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
61
62/** What is stored in an LSH bucket
63 */
64typedef uint32_t FeatureIndex;
65/** The id from which we can get a bucket back in an LSH table
66 */
67typedef unsigned int BucketKey;
68
69/** A bucket in an LSH table
70 */
71typedef std::vector<FeatureIndex> Bucket;
72
73////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
74
75/** POD for stats about an LSH table
76 */
77struct LshStats
78{
79    std::vector<unsigned int> bucket_sizes_;
80    size_t n_buckets_;
81    size_t bucket_size_mean_;
82    size_t bucket_size_median_;
83    size_t bucket_size_min_;
84    size_t bucket_size_max_;
85    size_t bucket_size_std_dev;
86    /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
87     */
88    std::vector<std::vector<unsigned int> > size_histogram_;
89};
90
91/** Overload the << operator for LshStats
92 * @param out the streams
93 * @param stats the stats to display
94 * @return the streams
95 */
96inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
97{
98    size_t w = 20;
99    out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
100    << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
101    << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
102    << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
103    << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
104    << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
105    << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
106
107    // Display the histogram
108    out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
109    << std::setiosflags(std::ios::left);
110    for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
111             stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
112
113    return out;
114}
115
116
117////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
118
119/** Lsh hash table. As its key is a sub-feature, and as usually
120 * the size of it is pretty small, we keep it as a continuous memory array.
121 * The value is an index in the corpus of features (we keep it as an unsigned
122 * int for pure memory reasons, it could be a size_t)
123 */
124template<typename ElementType>
125class LshTable
126{
127public:
128    /** A container of all the feature indices. Optimized for space
129     */
130#if USE_UNORDERED_MAP
131    typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
132#else
133    typedef std::map<BucketKey, Bucket> BucketsSpace;
134#endif
135
136    /** A container of all the feature indices. Optimized for speed
137     */
138    typedef std::vector<Bucket> BucketsSpeed;
139
140    /** Default constructor
141     */
142    LshTable()
143    {
144    }
145
146    /** Default constructor
147     * Create the mask and allocate the memory
148     * @param feature_size is the size of the feature (considered as a ElementType[])
149     * @param key_size is the number of bits that are turned on in the feature
150     */
151    LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
152    {
153        std::cerr << "LSH is not implemented for that type" << std::endl;
154        throw;
155    }
156
157    /** Add a feature to the table
158     * @param value the value to store for that feature
159     * @param feature the feature itself
160     */
161    void add(unsigned int value, const ElementType* feature)
162    {
163        // Add the value to the corresponding bucket
164        BucketKey key = getKey(feature);
165
166        switch (speed_level_) {
167        case kArray:
168            // That means we get the buckets from an array
169            buckets_speed_[key].push_back(value);
170            break;
171        case kBitsetHash:
172            // That means we can check the bitset for the presence of a key
173            key_bitset_.set(key);
174            buckets_space_[key].push_back(value);
175            break;
176        case kHash:
177        {
178            // That means we have to check for the hash table for the presence of a key
179            buckets_space_[key].push_back(value);
180            break;
181        }
182        }
183    }
184
185    /** Add a set of features to the table
186     * @param dataset the values to store
187     */
188    void add(Matrix<ElementType> dataset)
189    {
190#if USE_UNORDERED_MAP
191        if (!use_speed_) buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
192#endif
193        // Add the features to the table
194        for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
195        // Now that the table is full, optimize it for speed/space
196        optimize();
197    }
198
199    /** Get a bucket given the key
200     * @param key
201     * @return
202     */
203    inline const Bucket* getBucketFromKey(BucketKey key) const
204    {
205        // Generate other buckets
206        switch (speed_level_) {
207        case kArray:
208            // That means we get the buckets from an array
209            return &buckets_speed_[key];
210            break;
211        case kBitsetHash:
212            // That means we can check the bitset for the presence of a key
213            if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
214            else return 0;
215            break;
216        case kHash:
217        {
218            // That means we have to check for the hash table for the presence of a key
219            BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
220            bucket_it = buckets_space_.find(key);
221            // Stop here if that bucket does not exist
222            if (bucket_it == bucket_end) return 0;
223            else return &bucket_it->second;
224            break;
225        }
226        }
227        return 0;
228    }
229
230    /** Compute the sub-signature of a feature
231     */
232    size_t getKey(const ElementType* /*feature*/) const
233    {
234        std::cerr << "LSH is not implemented for that type" << std::endl;
235        throw;
236        return 1;
237    }
238
239    /** Get statistics about the table
240     * @return
241     */
242    LshStats getStats() const;
243
244private:
245    /** defines the speed fo the implementation
246     * kArray uses a vector for storing data
247     * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
248     * kHash uses a hash map only
249     */
250    enum SpeedLevel
251    {
252        kArray, kBitsetHash, kHash
253    };
254
255    /** Initialize some variables
256     */
257    void initialize(size_t key_size)
258    {
259        speed_level_ = kHash;
260        key_size_ = key_size;
261    }
262
263    /** Optimize the table for speed/space
264     */
265    void optimize()
266    {
267        // If we are already using the fast storage, no need to do anything
268        if (speed_level_ == kArray) return;
269
270        // Use an array if it will be more than half full
271        if (buckets_space_.size() > (unsigned int)((1 << key_size_) / 2)) {
272            speed_level_ = kArray;
273            // Fill the array version of it
274            buckets_speed_.resize(1 << key_size_);
275            for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
276
277            // Empty the hash table
278            buckets_space_.clear();
279            return;
280        }
281
282        // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
283        // for the vector) or less than 512MB (key_size_ <= 30)
284        if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
285             >= size_t(1 << key_size_)) || (key_size_ <= 32)) {
286            speed_level_ = kBitsetHash;
287            key_bitset_.resize(1 << key_size_);
288            key_bitset_.reset();
289            // Try with the BucketsSpace
290            for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
291        }
292        else {
293            speed_level_ = kHash;
294            key_bitset_.clear();
295        }
296    }
297
298    /** The vector of all the buckets if they are held for speed
299     */
300    BucketsSpeed buckets_speed_;
301
302    /** The hash table of all the buckets in case we cannot use the speed version
303     */
304    BucketsSpace buckets_space_;
305
306    /** What is used to store the data */
307    SpeedLevel speed_level_;
308
309    /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
310     * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
311     */
312    DynamicBitset key_bitset_;
313
314    /** The size of the sub-signature in bits
315     */
316    unsigned int key_size_;
317
318    // Members only used for the unsigned char specialization
319    /** The mask to apply to a feature to get the hash key
320     * Only used in the unsigned char case
321     */
322    std::vector<size_t> mask_;
323};
324
325////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
326// Specialization for unsigned char
327
328template<>
329inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
330{
331    initialize(subsignature_size);
332    // Allocate the mask
333    mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
334
335    // A bit brutal but fast to code
336    std::vector<size_t> indices(feature_size * CHAR_BIT);
337    for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
338    std::random_shuffle(indices.begin(), indices.end());
339
340    // Generate a random set of order of subsignature_size_ bits
341    for (unsigned int i = 0; i < key_size_; ++i) {
342        size_t index = indices[i];
343
344        // Set that bit in the mask
345        size_t divisor = CHAR_BIT * sizeof(size_t);
346        size_t idx = index / divisor; //pick the right size_t index
347        mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
348    }
349
350    // Set to 1 if you want to display the mask for debug
351#if 0
352    {
353        size_t bcount = 0;
354        BOOST_FOREACH(size_t mask_block, mask_){
355            out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
356                << std::endl;
357            bcount += __builtin_popcountll(mask_block);
358        }
359        out << "bit count : " << std::dec << bcount << std::endl;
360        out << "mask size : " << mask_.size() << std::endl;
361        return out;
362    }
363#endif
364}
365
366/** Return the Subsignature of a feature
367 * @param feature the feature to analyze
368 */
369template<>
370inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
371{
372    // no need to check if T is dividable by sizeof(size_t) like in the Hamming
373    // distance computation as we have a mask
374    const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
375
376    // Figure out the subsignature of the feature
377    // Given the feature ABCDEF, and the mask 001011, the output will be
378    // 000CEF
379    size_t subsignature = 0;
380    size_t bit_index = 1;
381
382    for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
383        // get the mask and signature blocks
384        size_t feature_block = *feature_block_ptr;
385        size_t mask_block = *pmask_block;
386        while (mask_block) {
387            // Get the lowest set bit in the mask block
388            size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
389            // Add it to the current subsignature if necessary
390            subsignature += (feature_block & lowest_bit) ? bit_index : 0;
391            // Reset the bit in the mask block
392            mask_block ^= lowest_bit;
393            // increment the bit index for the subsignature
394            bit_index <<= 1;
395        }
396        // Check the next feature block
397        ++feature_block_ptr;
398    }
399    return subsignature;
400}
401
402template<>
403inline LshStats LshTable<unsigned char>::getStats() const
404{
405    LshStats stats;
406    stats.bucket_size_mean_ = 0;
407    if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
408        stats.n_buckets_ = 0;
409        stats.bucket_size_median_ = 0;
410        stats.bucket_size_min_ = 0;
411        stats.bucket_size_max_ = 0;
412        return stats;
413    }
414
415    if (!buckets_speed_.empty()) {
416        for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
417            stats.bucket_sizes_.push_back(pbucket->size());
418            stats.bucket_size_mean_ += pbucket->size();
419        }
420        stats.bucket_size_mean_ /= buckets_speed_.size();
421        stats.n_buckets_ = buckets_speed_.size();
422    }
423    else {
424        for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
425            stats.bucket_sizes_.push_back(x->second.size());
426            stats.bucket_size_mean_ += x->second.size();
427        }
428        stats.bucket_size_mean_ /= buckets_space_.size();
429        stats.n_buckets_ = buckets_space_.size();
430    }
431
432    std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
433
434    //  BOOST_FOREACH(int size, stats.bucket_sizes_)
435    //          std::cout << size << " ";
436    //  std::cout << std::endl;
437    stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
438    stats.bucket_size_min_ = stats.bucket_sizes_.front();
439    stats.bucket_size_max_ = stats.bucket_sizes_.back();
440
441    // TODO compute mean and std
442    /*float mean, stddev;
443       stats.bucket_size_mean_ = mean;
444       stats.bucket_size_std_dev = stddev;*/
445
446    // Include a histogram of the buckets
447    unsigned int bin_start = 0;
448    unsigned int bin_end = 20;
449    bool is_new_bin = true;
450    for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
451         != end; )
452        if (*iterator < bin_end) {
453            if (is_new_bin) {
454                stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
455                stats.size_histogram_.back()[0] = bin_start;
456                stats.size_histogram_.back()[1] = bin_end - 1;
457                is_new_bin = false;
458            }
459            ++stats.size_histogram_.back()[2];
460            ++iterator;
461        }
462        else {
463            bin_start += 20;
464            bin_end += 20;
465            is_new_bin = true;
466        }
467
468    return stats;
469}
470
471// End the two namespaces
472}
473}
474
475////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
476
477#endif /* FLANN_LSH_TABLE_H_ */
Note: See TracBrowser for help on using the repository browser.