Context Navigation

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/util/lsh_table.h @ 15840

Visit:

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago
#2886 added utility console program for clustering of expressions
File size: 16.7 KB

Line
1	/***********************************************************************
2	* Software License Agreement (BSD License)
3	*
4	* Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5	* Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6	*
7	* THE BSD LICENSE
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions
11	* are met:
12	*
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22	* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*************************************************************************/
30
31	/***********************************************************************
32	* Author: Vincent Rabaud
33	*************************************************************************/
34
35	#ifndef FLANN_LSH_TABLE_H_
36	#define FLANN_LSH_TABLE_H_
37
38	#include <algorithm>
39	#include <iostream>
40	#include <iomanip>
41	#include <limits.h>
42	// TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
43	#if USE_UNORDERED_MAP
44	#include <unordered_map>
45	#else
46	#include <map>
47	#endif
48	#include <math.h>
49	#include <stddef.h>
50
51	#include "flann/util/dynamic_bitset.h"
52	#include "flann/util/matrix.h"
53
54	namespace flann
55	{
56
57	namespace lsh
58	{
59
60	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
61
62	/** What is stored in an LSH bucket
63	*/
64	typedef uint32_t FeatureIndex;
65	/** The id from which we can get a bucket back in an LSH table
66	*/
67	typedef unsigned int BucketKey;
68
69	/** A bucket in an LSH table
70	*/
71	typedef std::vector<FeatureIndex> Bucket;
72
73	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
74
75	/** POD for stats about an LSH table
76	*/
77	struct LshStats
78	{
79	std::vector<unsigned int> bucket_sizes_;
80	size_t n_buckets_;
81	size_t bucket_size_mean_;
82	size_t bucket_size_median_;
83	size_t bucket_size_min_;
84	size_t bucket_size_max_;
85	size_t bucket_size_std_dev;
86	/** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
87	*/
88	std::vector<std::vector<unsigned int> > size_histogram_;
89	};
90
91	/** Overload the << operator for LshStats
92	* @param out the streams
93	* @param stats the stats to display
94	* @return the streams
95	*/
96	inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
97	{
98	size_t w = 20;
99	out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
100	<< stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
101	<< std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
102	<< std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
103	<< std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
104	<< stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
105	<< std::setiosflags(std::ios::left) << stats.bucket_size_max_;
106
107	// Display the histogram
108	out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
109	<< std::setiosflags(std::ios::left);
110	for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
111	stats.size_histogram_.end(); iterator != end; ++iterator) out << (iterator)[0] << "-" << (iterator)[1] << ": " << (*iterator)[2] << ", ";
112
113	return out;
114	}
115
116
117	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
118
119	/** Lsh hash table. As its key is a sub-feature, and as usually
120	* the size of it is pretty small, we keep it as a continuous memory array.
121	* The value is an index in the corpus of features (we keep it as an unsigned
122	* int for pure memory reasons, it could be a size_t)
123	*/
124	template<typename ElementType>
125	class LshTable
126	{
127	public:
128	/** A container of all the feature indices. Optimized for space
129	*/
130	#if USE_UNORDERED_MAP
131	typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
132	#else
133	typedef std::map<BucketKey, Bucket> BucketsSpace;
134	#endif
135
136	/** A container of all the feature indices. Optimized for speed
137	*/
138	typedef std::vector<Bucket> BucketsSpeed;
139
140	/** Default constructor
141	*/
142	LshTable()
143	{
144	}
145
146	/** Default constructor
147	* Create the mask and allocate the memory
148	* @param feature_size is the size of the feature (considered as a ElementType[])
149	* @param key_size is the number of bits that are turned on in the feature
150	*/
151	LshTable(unsigned int /feature_size/, unsigned int /key_size/)
152	{
153	std::cerr << "LSH is not implemented for that type" << std::endl;
154	throw;
155	}
156
157	/** Add a feature to the table
158	* @param value the value to store for that feature
159	* @param feature the feature itself
160	*/
161	void add(unsigned int value, const ElementType* feature)
162	{
163	// Add the value to the corresponding bucket
164	BucketKey key = getKey(feature);
165
166	switch (speed_level_) {
167	case kArray:
168	// That means we get the buckets from an array
169	buckets_speed_[key].push_back(value);
170	break;
171	case kBitsetHash:
172	// That means we can check the bitset for the presence of a key
173	key_bitset_.set(key);
174	buckets_space_[key].push_back(value);
175	break;
176	case kHash:
177	{
178	// That means we have to check for the hash table for the presence of a key
179	buckets_space_[key].push_back(value);
180	break;
181	}
182	}
183	}
184
185	/** Add a set of features to the table
186	* @param dataset the values to store
187	*/
188	void add(Matrix<ElementType> dataset)
189	{
190	#if USE_UNORDERED_MAP
191	if (!use_speed_) buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
192	#endif
193	// Add the features to the table
194	for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
195	// Now that the table is full, optimize it for speed/space
196	optimize();
197	}
198
199	/** Get a bucket given the key
200	* @param key
201	* @return
202	*/
203	inline const Bucket* getBucketFromKey(BucketKey key) const
204	{
205	// Generate other buckets
206	switch (speed_level_) {
207	case kArray:
208	// That means we get the buckets from an array
209	return &buckets_speed_[key];
210	break;
211	case kBitsetHash:
212	// That means we can check the bitset for the presence of a key
213	if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
214	else return 0;
215	break;
216	case kHash:
217	{
218	// That means we have to check for the hash table for the presence of a key
219	BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
220	bucket_it = buckets_space_.find(key);
221	// Stop here if that bucket does not exist
222	if (bucket_it == bucket_end) return 0;
223	else return &bucket_it->second;
224	break;
225	}
226	}
227	return 0;
228	}
229
230	/** Compute the sub-signature of a feature
231	*/
232	size_t getKey(const ElementType* /feature/) const
233	{
234	std::cerr << "LSH is not implemented for that type" << std::endl;
235	throw;
236	return 1;
237	}
238
239	/** Get statistics about the table
240	* @return
241	*/
242	LshStats getStats() const;
243
244	private:
245	/** defines the speed fo the implementation
246	* kArray uses a vector for storing data
247	* kBitsetHash uses a hash map but checks for the validity of a key with a bitset
248	* kHash uses a hash map only
249	*/
250	enum SpeedLevel
251	{
252	kArray, kBitsetHash, kHash
253	};
254
255	/** Initialize some variables
256	*/
257	void initialize(size_t key_size)
258	{
259	speed_level_ = kHash;
260	key_size_ = key_size;
261	}
262
263	/** Optimize the table for speed/space
264	*/
265	void optimize()
266	{
267	// If we are already using the fast storage, no need to do anything
268	if (speed_level_ == kArray) return;
269
270	// Use an array if it will be more than half full
271	if (buckets_space_.size() > (unsigned int)((1 << key_size_) / 2)) {
272	speed_level_ = kArray;
273	// Fill the array version of it
274	buckets_speed_.resize(1 << key_size_);
275	for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
276
277	// Empty the hash table
278	buckets_space_.clear();
279	return;
280	}
281
282	// If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
283	// for the vector) or less than 512MB (key_size_ <= 30)
284	if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
285	>= size_t(1 << key_size_)) \|\| (key_size_ <= 32)) {
286	speed_level_ = kBitsetHash;
287	key_bitset_.resize(1 << key_size_);
288	key_bitset_.reset();
289	// Try with the BucketsSpace
290	for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
291	}
292	else {
293	speed_level_ = kHash;
294	key_bitset_.clear();
295	}
296	}
297
298	/** The vector of all the buckets if they are held for speed
299	*/
300	BucketsSpeed buckets_speed_;
301
302	/** The hash table of all the buckets in case we cannot use the speed version
303	*/
304	BucketsSpace buckets_space_;
305
306	/** What is used to store the data */
307	SpeedLevel speed_level_;
308
309	/** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
310	* That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
311	*/
312	DynamicBitset key_bitset_;
313
314	/** The size of the sub-signature in bits
315	*/
316	unsigned int key_size_;
317
318	// Members only used for the unsigned char specialization
319	/** The mask to apply to a feature to get the hash key
320	* Only used in the unsigned char case
321	*/
322	std::vector<size_t> mask_;
323	};
324
325	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
326	// Specialization for unsigned char
327
328	template<>
329	inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
330	{
331	initialize(subsignature_size);
332	// Allocate the mask
333	mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
334
335	// A bit brutal but fast to code
336	std::vector<size_t> indices(feature_size * CHAR_BIT);
337	for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
338	std::random_shuffle(indices.begin(), indices.end());
339
340	// Generate a random set of order of subsignature_size_ bits
341	for (unsigned int i = 0; i < key_size_; ++i) {
342	size_t index = indices[i];
343
344	// Set that bit in the mask
345	size_t divisor = CHAR_BIT * sizeof(size_t);
346	size_t idx = index / divisor; //pick the right size_t index
347	mask_[idx] \|= size_t(1) << (index % divisor); //use modulo to find the bit offset
348	}
349
350	// Set to 1 if you want to display the mask for debug
351	#if 0
352	{
353	size_t bcount = 0;
354	BOOST_FOREACH(size_t mask_block, mask_){
355	out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
356	<< std::endl;
357	bcount += __builtin_popcountll(mask_block);
358	}
359	out << "bit count : " << std::dec << bcount << std::endl;
360	out << "mask size : " << mask_.size() << std::endl;
361	return out;
362	}
363	#endif
364	}
365
366	/** Return the Subsignature of a feature
367	* @param feature the feature to analyze
368	*/
369	template<>
370	inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
371	{
372	// no need to check if T is dividable by sizeof(size_t) like in the Hamming
373	// distance computation as we have a mask
374	const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
375
376	// Figure out the subsignature of the feature
377	// Given the feature ABCDEF, and the mask 001011, the output will be
378	// 000CEF
379	size_t subsignature = 0;
380	size_t bit_index = 1;
381
382	for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
383	// get the mask and signature blocks
384	size_t feature_block = *feature_block_ptr;
385	size_t mask_block = *pmask_block;
386	while (mask_block) {
387	// Get the lowest set bit in the mask block
388	size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
389	// Add it to the current subsignature if necessary
390	subsignature += (feature_block & lowest_bit) ? bit_index : 0;
391	// Reset the bit in the mask block
392	mask_block ^= lowest_bit;
393	// increment the bit index for the subsignature
394	bit_index <<= 1;
395	}
396	// Check the next feature block
397	++feature_block_ptr;
398	}
399	return subsignature;
400	}
401
402	template<>
403	inline LshStats LshTable<unsigned char>::getStats() const
404	{
405	LshStats stats;
406	stats.bucket_size_mean_ = 0;
407	if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
408	stats.n_buckets_ = 0;
409	stats.bucket_size_median_ = 0;
410	stats.bucket_size_min_ = 0;
411	stats.bucket_size_max_ = 0;
412	return stats;
413	}
414
415	if (!buckets_speed_.empty()) {
416	for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
417	stats.bucket_sizes_.push_back(pbucket->size());
418	stats.bucket_size_mean_ += pbucket->size();
419	}
420	stats.bucket_size_mean_ /= buckets_speed_.size();
421	stats.n_buckets_ = buckets_speed_.size();
422	}
423	else {
424	for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
425	stats.bucket_sizes_.push_back(x->second.size());
426	stats.bucket_size_mean_ += x->second.size();
427	}
428	stats.bucket_size_mean_ /= buckets_space_.size();
429	stats.n_buckets_ = buckets_space_.size();
430	}
431
432	std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
433
434	// BOOST_FOREACH(int size, stats.bucket_sizes_)
435	// std::cout << size << " ";
436	// std::cout << std::endl;
437	stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
438	stats.bucket_size_min_ = stats.bucket_sizes_.front();
439	stats.bucket_size_max_ = stats.bucket_sizes_.back();
440
441	// TODO compute mean and std
442	/*float mean, stddev;
443	stats.bucket_size_mean_ = mean;
444	stats.bucket_size_std_dev = stddev;*/
445
446	// Include a histogram of the buckets
447	unsigned int bin_start = 0;
448	unsigned int bin_end = 20;
449	bool is_new_bin = true;
450	for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
451	!= end; )
452	if (*iterator < bin_end) {
453	if (is_new_bin) {
454	stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
455	stats.size_histogram_.back()[0] = bin_start;
456	stats.size_histogram_.back()[1] = bin_end - 1;
457	is_new_bin = false;
458	}
459	++stats.size_histogram_.back()[2];
460	++iterator;
461	}
462	else {
463	bin_start += 20;
464	bin_end += 20;
465	is_new_bin = true;
466	}
467
468	return stats;
469	}
470
471	// End the two namespaces
472	}
473	}
474
475	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
476
477	#endif /* FLANN_LSH_TABLE_H_ */

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences