
source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/algorithms/lsh_index.h @ 15840

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago

#2886 added utility console program for clustering of expressions

File size: 16.9 KB
/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

/***********************************************************************
 * Author: Vincent Rabaud
 *************************************************************************/

#ifndef FLANN_LSH_INDEX_H_
#define FLANN_LSH_INDEX_H_

#include <algorithm>
#include <cassert>
#include <cstring>
#include <map>
#include <vector>

#include "flann/general.h"
#include "flann/algorithms/nn_index.h"
#include "flann/util/matrix.h"
#include "flann/util/result_set.h"
#include "flann/util/heap.h"
#include "flann/util/lsh_table.h"
#include "flann/util/allocator.h"
#include "flann/util/random.h"
#include "flann/util/saving.h"

namespace flann
{

struct LshIndexParams : public IndexParams
{
    LshIndexParams(unsigned int table_number = 12, unsigned int key_size = 20, unsigned int multi_probe_level = 2)
    {
        (*this)["algorithm"] = FLANN_INDEX_LSH;
        // The number of hash tables to use
        (*this)["table_number"] = table_number;
        // The length of the key in the hash tables
        (*this)["key_size"] = key_size;
        // Number of levels to use in multi-probe (0 for standard LSH)
        (*this)["multi_probe_level"] = multi_probe_level;
    }
};

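// Example: LshIndexParams(16, 24, 1) describes an index built from 16 hash
// tables with 24-bit bucket keys that probes, besides the query's own bucket,
// every bucket whose key differs in at most one bit. The values are stored in
// the IndexParams map and read back in the LshIndex constructor below via
// get_param<unsigned int>().
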
/**
 * Locality-sensitive hashing (LSH) index
 *
 * Contains the LSH hash tables and other information for indexing a set of
 * points for nearest-neighbor matching.
 */
template<typename Distance>
class LshIndex : public NNIndex<Distance>
{
public:
    typedef typename Distance::ElementType ElementType;
    typedef typename Distance::ResultType DistanceType;

    /** Constructor
     * @param input_data dataset with the input features
     * @param params parameters passed to the LSH algorithm
     * @param d the distance used
     */
    LshIndex(const Matrix<ElementType>& input_data, const IndexParams& params = LshIndexParams(),
             Distance d = Distance()) :
        dataset_(input_data), index_params_(params), distance_(d)
    {
        table_number_ = get_param<unsigned int>(index_params_,"table_number",12);
        key_size_ = get_param<unsigned int>(index_params_,"key_size",20);
        multi_probe_level_ = get_param<unsigned int>(index_params_,"multi_probe_level",2);

        feature_size_ = dataset_.cols;
        fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
    }
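    // Note: the constructor only stores the dataset, reads the parameters and
    // precomputes the multi-probe XOR masks; the hash tables themselves are
    // created when buildIndex() is called.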


    LshIndex(const LshIndex&);
    LshIndex& operator=(const LshIndex&);

    /**
     * Builds the index
     */
    void buildIndex()
    {
        tables_.resize(table_number_);
        for (unsigned int i = 0; i < table_number_; ++i) {
            lsh::LshTable<ElementType>& table = tables_[i];
            table = lsh::LshTable<ElementType>(feature_size_, key_size_);

            // Add the features to the table
            table.add(dataset_);
        }
    }

    flann_algorithm_t getType() const
    {
        return FLANN_INDEX_LSH;
    }


    void saveIndex(FILE* stream)
    {
        save_value(stream,table_number_);
        save_value(stream,key_size_);
        save_value(stream,multi_probe_level_);
        save_value(stream, dataset_);
    }

    void loadIndex(FILE* stream)
    {
        load_value(stream, table_number_);
        load_value(stream, key_size_);
        load_value(stream, multi_probe_level_);
        load_value(stream, dataset_);
        // Building the index is so fast that we can afford not to store it
        buildIndex();

        index_params_["algorithm"] = getType();
        index_params_["table_number"] = table_number_;
        index_params_["key_size"] = key_size_;
        index_params_["multi_probe_level"] = multi_probe_level_;
    }

    /**
     *  Returns size of index.
     */
    size_t size() const
    {
        return dataset_.rows;
    }

    /**
     * Returns the length of an index feature.
     */
    size_t veclen() const
    {
        return feature_size_;
    }

    /**
     * Computes the index memory usage
     * Returns: memory used by the index
     */
    int usedMemory() const
    {
        return dataset_.rows * sizeof(int);
    }


    IndexParams getParameters() const
    {
        return index_params_;
    }

    /**
     * \brief Perform k-nearest neighbor search
     * \param[in] queries The query points for which to find the nearest neighbors
     * \param[out] indices The indices of the nearest neighbors found
     * \param[out] dists Distances to the nearest neighbors found
     * \param[in] knn Number of nearest neighbors to return
     * \param[in] params Search parameters
     */
    virtual int knnSearch(const Matrix<ElementType>& queries,
                          Matrix<int>& indices,
                          Matrix<DistanceType>& dists,
                          size_t knn,
                          const SearchParams& params)
    {
        assert(queries.cols == veclen());
        assert(indices.rows >= queries.rows);
        assert(dists.rows >= queries.rows);
        assert(indices.cols >= knn);
        assert(dists.cols >= knn);

        int count = 0;
        if (params.use_heap == FLANN_True) {
            KNNUniqueResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                resultSet.copy(indices[i], dists[i], knn, params.sorted);
                count += resultSet.size();
            }
        }
        else {
            KNNResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                resultSet.copy(indices[i], dists[i], knn, params.sorted);
                count += resultSet.size();
            }
        }

        return count;
    }
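    // The return value is the total number of neighbors found over all
    // queries (at most knn per query).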

    /**
     * \brief Perform k-nearest neighbor search
     * \param[in] queries The query points for which to find the nearest neighbors
     * \param[out] indices The indices of the nearest neighbors found
     * \param[out] dists Distances to the nearest neighbors found
     * \param[in] knn Number of nearest neighbors to return
     * \param[in] params Search parameters
     */
    virtual int knnSearch(const Matrix<ElementType>& queries,
                          std::vector< std::vector<int> >& indices,
                          std::vector<std::vector<DistanceType> >& dists,
                          size_t knn,
                          const SearchParams& params)
    {
        assert(queries.cols == veclen());
        if (indices.size() < queries.rows) indices.resize(queries.rows);
        if (dists.size() < queries.rows) dists.resize(queries.rows);

        int count = 0;
        if (params.use_heap == FLANN_True) {
            KNNUniqueResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                size_t n = std::min(resultSet.size(), knn);
                indices[i].resize(n);
                dists[i].resize(n);
                resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
                count += n;
            }
        }
        else {
            KNNResultSet<DistanceType> resultSet(knn);
            for (size_t i = 0; i < queries.rows; i++) {
                resultSet.clear();
                findNeighbors(resultSet, queries[i], params);
                size_t n = std::min(resultSet.size(), knn);
                indices[i].resize(n);
                dists[i].resize(n);
                resultSet.copy(&indices[i][0], &dists[i][0], n, params.sorted);
                count += n;
            }
        }

        return count;
    }

    /**
     * Find set of nearest neighbors to vec. Their indices are stored inside
     * the result object.
     *
     * Params:
     *     result = the result object in which the indices of the nearest-neighbors are stored
     *     vec = the vector for which to search the nearest neighbors
     *     searchParams = the search parameters (currently unused by this index)
     */
    void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& /*searchParams*/)
    {
        getNeighbors(vec, result);
    }

private:
    /** Defines the comparator on score and index
     */
    typedef std::pair<float, unsigned int> ScoreIndexPair;
    struct SortScoreIndexPairOnSecond
    {
        bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const
        {
            return left.second < right.second;
        }
    };

    /** Fills the different xor masks to use when getting the neighbors in multi-probe LSH
     * @param key the key we build neighbors from
     * @param lowest_index the lowest index of the bit set
     * @param level the multi-probe level we are at
     * @param xor_masks all the generated xor masks
     */
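    // For example, fill_xor_mask(0, 3, 1, masks) produces the masks
    // {000, 100, 010, 001}: the unmodified key plus every single-bit flip.
    // With level 2 the two-bit combinations 110, 101 and 011 are added as well.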
    void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level,
                       std::vector<lsh::BucketKey>& xor_masks)
    {
        xor_masks.push_back(key);
        if (level == 0) return;
        for (int index = lowest_index - 1; index >= 0; --index) {
            // Create a new key
            lsh::BucketKey new_key = key | (1 << index);
            fill_xor_mask(new_key, index, level - 1, xor_masks);
        }
    }

    /** Performs the approximate nearest-neighbor search.
     * @param vec the feature to analyze
     * @param do_radius flag indicating if we check the radius too
     * @param radius the radius if it is a radius search
     * @param do_k flag indicating if we limit the number of nn
     * @param k_nn the number of nearest neighbors
     * @param checked_average used for debugging
     */
    void getNeighbors(const ElementType* vec, bool do_radius, float radius, bool do_k, unsigned int k_nn,
                      float& checked_average)
    {
        static std::vector<ScoreIndexPair> score_index_heap;

        if (do_k) {
            unsigned int worst_score = std::numeric_limits<unsigned int>::max();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
            for (; table != table_end; ++table) {
                size_t key = table->getKey(vec);
                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
                for (; xor_mask != xor_mask_end; ++xor_mask) {
                    size_t sub_key = key ^ (*xor_mask);
                    const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                    if (bucket == 0) continue;

                    // Go over each descriptor index
                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                    DistanceType hamming_distance;

                    // Process the rest of the candidates
                    for (; training_index < last_training_index; ++training_index) {
                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);

                        if (hamming_distance < worst_score) {
                            // Insert the new element
                            score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
                            std::push_heap(score_index_heap.begin(), score_index_heap.end());

                            if (score_index_heap.size() > (unsigned int)k_nn) {
                                // Remove the highest distance value as we have too many elements
                                std::pop_heap(score_index_heap.begin(), score_index_heap.end());
                                score_index_heap.pop_back();
                                // Keep track of the worst score
                                worst_score = score_index_heap.front().first;
                            }
                        }
                    }
                }
            }
        }
        else {
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
            typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
            for (; table != table_end; ++table) {
                size_t key = table->getKey(vec);
                std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
                std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
                for (; xor_mask != xor_mask_end; ++xor_mask) {
                    size_t sub_key = key ^ (*xor_mask);
                    const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                    if (bucket == 0) continue;

                    // Go over each descriptor index
                    std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                    std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                    DistanceType hamming_distance;

                    // Process the rest of the candidates
                    for (; training_index < last_training_index; ++training_index) {
                        // Compute the Hamming distance
                        hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
                        if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, *training_index));
                    }
                }
            }
        }
    }

    /** Performs the approximate nearest-neighbor search.
     * This is a slower version than the above as it uses the ResultSet
     * @param vec the feature to analyze
     */
    void getNeighbors(const ElementType* vec, ResultSet<DistanceType>& result)
    {
        typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
        typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
        for (; table != table_end; ++table) {
            size_t key = table->getKey(vec);
            std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
            std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
            for (; xor_mask != xor_mask_end; ++xor_mask) {
                size_t sub_key = key ^ (*xor_mask);
                const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
                if (bucket == 0) continue;

                // Go over each descriptor index
                std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
                std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
                DistanceType hamming_distance;

                // Process the rest of the candidates
                for (; training_index < last_training_index; ++training_index) {
                    // Compute the Hamming distance
                    hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
                    result.addPoint(hamming_distance, *training_index);
                }
            }
        }
    }

    /** The different hash tables */
    std::vector<lsh::LshTable<ElementType> > tables_;

    /** The data the LSH tables were built from */
    Matrix<ElementType> dataset_;

    /** The size of the features (as ElementType[]) */
    unsigned int feature_size_;

    IndexParams index_params_;

    /** table number */
    unsigned int table_number_;
    /** key size */
    unsigned int key_size_;
    /** How far should we look for neighbors in multi-probe LSH */
    unsigned int multi_probe_level_;

    /** The XOR masks to apply to a key to get the neighboring buckets */
    std::vector<lsh::BucketKey> xor_masks_;

    Distance distance_;
};
}

#endif //FLANN_LSH_INDEX_H_
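
The snippet below is a minimal usage sketch for this index: it builds an LSH index over binary descriptors and runs a k-nearest-neighbor query against it. It assumes the Hamming distance functor declared in flann/algorithms/dist.h and that the flann sources in this branch are on the include path; the descriptor data, sizes and variable names are placeholders, not part of the header above.

#include "flann/algorithms/dist.h"       // flann::Hamming (assumed location)
#include "flann/algorithms/lsh_index.h"  // flann::LshIndex, the header above

#include <cstddef>
#include <vector>

int main()
{
    typedef flann::Hamming<unsigned char> Distance;
    typedef Distance::ResultType DistanceType;

    const size_t n_points = 1000;  // number of database descriptors (placeholder)
    const size_t n_bytes  = 32;    // 256-bit binary descriptors
    const size_t knn      = 4;

    // Placeholder descriptors; in practice these would be e.g. BRIEF/ORB features.
    std::vector<unsigned char> db(n_points * n_bytes, 0);
    std::vector<unsigned char> q(n_bytes, 0);
    flann::Matrix<unsigned char> dataset(&db[0], n_points, n_bytes);
    flann::Matrix<unsigned char> query(&q[0], 1, n_bytes);

    // 12 tables, 20-bit bucket keys, multi-probe within 2 bits (the defaults above).
    flann::LshIndex<Distance> index(dataset, flann::LshIndexParams(12, 20, 2));
    index.buildIndex();

    // Output buffers: one row per query, knn columns.
    std::vector<int> neighbor_indices(knn);
    std::vector<DistanceType> neighbor_dists(knn);
    flann::Matrix<int> indices(&neighbor_indices[0], 1, knn);
    flann::Matrix<DistanceType> dists(&neighbor_dists[0], 1, knn);

    index.knnSearch(query, indices, dists, knn, flann::SearchParams());
    return 0;
}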