Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/flann/include/flann/algorithms/autotuned_index.h @ 15840

Last change on this file since 15840 was 15840, checked in by gkronber, 6 years ago

#2886 added utility console program for clustering of expressions

File size: 20.2 KB
Line 
1/***********************************************************************
2 * Software License Agreement (BSD License)
3 *
4 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6 *
7 * THE BSD LICENSE
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
30#ifndef FLANN_AUTOTUNED_INDEX_H_
31#define FLANN_AUTOTUNED_INDEX_H_
32
33#include "flann/general.h"
34#include "flann/algorithms/nn_index.h"
35#include "flann/nn/ground_truth.h"
36#include "flann/nn/index_testing.h"
37#include "flann/util/sampling.h"
38#include "flann/algorithms/kdtree_index.h"
39#include "flann/algorithms/kdtree_single_index.h"
40#include "flann/algorithms/kmeans_index.h"
41#include "flann/algorithms/composite_index.h"
42#include "flann/algorithms/linear_index.h"
43#include "flann/util/logger.h"
44
45namespace flann
46{
47
/**
 * Factory (defined elsewhere in FLANN): creates a concrete NNIndex matching the
 * "algorithm" entry of @a params for the given dataset and distance functor.
 * The caller owns the returned pointer.
 */
template<typename Distance>
NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);
50
51
52struct AutotunedIndexParams : public IndexParams
53{
54    AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
55    {
56        (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED;
57        // precision desired (used for autotuning, -1 otherwise)
58        (*this)["target_precision"] = target_precision;
59        // build tree time weighting factor
60        (*this)["build_weight"] = build_weight;
61        // index memory weighting factor
62        (*this)["memory_weight"] = memory_weight;
63        // what fraction of the dataset to use for autotuning
64        (*this)["sample_fraction"] = sample_fraction;
65    }
66};
67
68
69template <typename Distance>
70class AutotunedIndex : public NNIndex<Distance>
71{
72public:
73    typedef typename Distance::ElementType ElementType;
74    typedef typename Distance::ResultType DistanceType;
75
76    typedef bool needs_kdtree_distance;
77
78    AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
79        dataset_(inputData), distance_(d)
80    {
81        target_precision_ = get_param(params, "target_precision",0.8f);
82        build_weight_ =  get_param(params,"build_weight", 0.01f);
83        memory_weight_ = get_param(params, "memory_weight", 0.0f);
84        sample_fraction_ = get_param(params,"sample_fraction", 0.1f);
85        bestIndex_ = NULL;
86    }
87
88    AutotunedIndex(const AutotunedIndex&);
89    AutotunedIndex& operator=(const AutotunedIndex&);
90
91    virtual ~AutotunedIndex()
92    {
93        if (bestIndex_ != NULL) {
94            delete bestIndex_;
95            bestIndex_ = NULL;
96        }
97    }
98
99    /**
100     *          Method responsible with building the index.
101     */
102    virtual void buildIndex()
103    {
104        bestParams_ = estimateBuildParams();
105        Logger::info("----------------------------------------------------\n");
106        Logger::info("Autotuned parameters:\n");
107        print_params(bestParams_);
108        Logger::info("----------------------------------------------------\n");
109
110        bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
111        bestIndex_->buildIndex();
112        speedup_ = estimateSearchParams(bestSearchParams_);
113        Logger::info("----------------------------------------------------\n");
114        Logger::info("Search parameters:\n");
115        print_params(bestSearchParams_);
116        Logger::info("----------------------------------------------------\n");
117    }
118
119    /**
120     *  Saves the index to a stream
121     */
122    virtual void saveIndex(FILE* stream)
123    {
124        save_value(stream, (int)bestIndex_->getType());
125        bestIndex_->saveIndex(stream);
126        save_value(stream, bestSearchParams_.checks);
127    }
128
129    /**
130     *  Loads the index from a stream
131     */
132    virtual void loadIndex(FILE* stream)
133    {
134        int index_type;
135
136        load_value(stream, index_type);
137        IndexParams params;
138        params["algorithm"] = (flann_algorithm_t)index_type;
139        bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
140        bestIndex_->loadIndex(stream);
141        load_value(stream, bestSearchParams_.checks);
142    }
143
144    /**
145     *      Method that searches for nearest-neighbors
146     */
147    virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
148    {
149        if (searchParams.checks == FLANN_CHECKS_AUTOTUNED) {
150            bestIndex_->findNeighbors(result, vec, bestSearchParams_);
151        }
152        else {
153            bestIndex_->findNeighbors(result, vec, searchParams);
154        }
155    }
156
157
158    IndexParams getParameters() const
159    {
160        return bestIndex_->getParameters();
161    }
162
163    SearchParams getSearchParameters() const
164    {
165        return bestSearchParams_;
166    }
167
168    float getSpeedup() const
169    {
170        return speedup_;
171    }
172
173
174    /**
175     *      Number of features in this index.
176     */
177    virtual size_t size() const
178    {
179        return bestIndex_->size();
180    }
181
182    /**
183     *  The length of each vector in this index.
184     */
185    virtual size_t veclen() const
186    {
187        return bestIndex_->veclen();
188    }
189
190    /**
191     * The amount of memory (in bytes) this index uses.
192     */
193    virtual int usedMemory() const
194    {
195        return bestIndex_->usedMemory();
196    }
197
198    /**
199     * Algorithm name
200     */
201    virtual flann_algorithm_t getType() const
202    {
203        return FLANN_INDEX_AUTOTUNED;
204    }
205
206private:
207
208    struct CostData
209    {
210        float searchTimeCost;
211        float buildTimeCost;
212        float memoryCost;
213        float totalCost;
214        IndexParams params;
215    };
216
217    void evaluate_kmeans(CostData& cost)
218    {
219        StartStopTimer t;
220        int checks;
221        const int nn = 1;
222
223        Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
224                     get_param<int>(cost.params,"iterations"),
225                     get_param<int>(cost.params,"branching"));
226        KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);
227        // measure index build time
228        t.start();
229        kmeans.buildIndex();
230        t.stop();
231        float buildTime = (float)t.value;
232
233        // measure search time
234        float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
235
236        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
237        cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
238        cost.searchTimeCost = searchTime;
239        cost.buildTimeCost = buildTime;
240        Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
241    }
242
243
244    void evaluate_kdtree(CostData& cost)
245    {
246        StartStopTimer t;
247        int checks;
248        const int nn = 1;
249
250        Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees"));
251        KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);
252
253        t.start();
254        kdtree.buildIndex();
255        t.stop();
256        float buildTime = (float)t.value;
257
258        //measure search time
259        float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
260
261        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
262        cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
263        cost.searchTimeCost = searchTime;
264        cost.buildTimeCost = buildTime;
265        Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
266    }
267
268
269    //    struct KMeansSimpleDownhillFunctor {
270    //
271    //        Autotune& autotuner;
272    //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
273    //
274    //        float operator()(int* params) {
275    //
276    //            float maxFloat = numeric_limits<float>::max();
277    //
278    //            if (params[0]<2) return maxFloat;
279    //            if (params[1]<0) return maxFloat;
280    //
281    //            CostData c;
282    //            c.params["algorithm"] = KMEANS;
283    //            c.params["centers-init"] = CENTERS_RANDOM;
284    //            c.params["branching"] = params[0];
285    //            c.params["max-iterations"] = params[1];
286    //
287    //            autotuner.evaluate_kmeans(c);
288    //
289    //            return c.timeCost;
290    //
291    //        }
292    //    };
293    //
294    //    struct KDTreeSimpleDownhillFunctor {
295    //
296    //        Autotune& autotuner;
297    //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
298    //
299    //        float operator()(int* params) {
300    //            float maxFloat = numeric_limits<float>::max();
301    //
302    //            if (params[0]<1) return maxFloat;
303    //
304    //            CostData c;
305    //            c.params["algorithm"] = KDTREE;
306    //            c.params["trees"] = params[0];
307    //
308    //            autotuner.evaluate_kdtree(c);
309    //
310    //            return c.timeCost;
311    //
312    //        }
313    //    };
314
315
316
317    void optimizeKMeans(std::vector<CostData>& costs)
318    {
319        Logger::info("KMEANS, Step 1: Exploring parameter space\n");
320
321        // explore kmeans parameters space using combinations of the parameters below
322        int maxIterations[] = { 1, 5, 10, 15 };
323        int branchingFactors[] = { 16, 32, 64, 128, 256 };
324
325        int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors);
326        costs.reserve(costs.size() + kmeansParamSpaceSize);
327
328        // evaluate kmeans for all parameter combinations
329        for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) {
330            for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) {
331                CostData cost;
332                cost.params["algorithm"] = FLANN_INDEX_KMEANS;
333                cost.params["centers_init"] = FLANN_CENTERS_RANDOM;
334                cost.params["iterations"] = maxIterations[i];
335                cost.params["branching"] = branchingFactors[j];
336
337                evaluate_kmeans(cost);
338                costs.push_back(cost);
339            }
340        }
341
342        //         Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
343        //
344        //         const int n = 2;
345        //         // choose initial simplex points as the best parameters so far
346        //         int kmeansNMPoints[n*(n+1)];
347        //         float kmeansVals[n+1];
348        //         for (int i=0;i<n+1;++i) {
349        //             kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
350        //             kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
351        //             kmeansVals[i] = kmeansCosts[i].timeCost;
352        //         }
353        //         KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
354        //         // run optimization
355        //         optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
356        //         // store results
357        //         for (int i=0;i<n+1;++i) {
358        //             kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
359        //             kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
360        //             kmeansCosts[i].timeCost = kmeansVals[i];
361        //         }
362    }
363
364
365    void optimizeKDTree(std::vector<CostData>& costs)
366    {
367        Logger::info("KD-TREE, Step 1: Exploring parameter space\n");
368
369        // explore kd-tree parameters space using the parameters below
370        int testTrees[] = { 1, 4, 8, 16, 32 };
371
372        // evaluate kdtree for all parameter combinations
373        for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) {
374            CostData cost;
375            cost.params["trees"] = testTrees[i];
376
377            evaluate_kdtree(cost);
378            costs.push_back(cost);
379        }
380
381        //         Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
382        //
383        //         const int n = 1;
384        //         // choose initial simplex points as the best parameters so far
385        //         int kdtreeNMPoints[n*(n+1)];
386        //         float kdtreeVals[n+1];
387        //         for (int i=0;i<n+1;++i) {
388        //             kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
389        //             kdtreeVals[i] = kdtreeCosts[i].timeCost;
390        //         }
391        //         KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
392        //         // run optimization
393        //         optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
394        //         // store results
395        //         for (int i=0;i<n+1;++i) {
396        //             kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
397        //             kdtreeCosts[i].timeCost = kdtreeVals[i];
398        //         }
399    }
400
401    /**
402     *  Chooses the best nearest-neighbor algorithm and estimates the optimal
403     *  parameters to use when building the index (for a given precision).
404     *  Returns a dictionary with the optimal parameters.
405     */
406    IndexParams estimateBuildParams()
407    {
408        std::vector<CostData> costs;
409
410        int sampleSize = int(sample_fraction_ * dataset_.rows);
411        int testSampleSize = std::min(sampleSize / 10, 1000);
412
413        Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", dataset_.rows, sampleSize, testSampleSize, target_precision_);
414
415        // For a very small dataset, it makes no sense to build any fancy index, just
416        // use linear search
417        if (testSampleSize < 10) {
418            Logger::info("Choosing linear, dataset too small\n");
419            return LinearIndexParams();
420        }
421
422        // We use a fraction of the original dataset to speedup the autotune algorithm
423        sampledDataset_ = random_sample(dataset_, sampleSize);
424        // We use a cross-validation approach, first we sample a testset from the dataset
425        testDataset_ = random_sample(sampledDataset_, testSampleSize, true);
426
427        // We compute the ground truth using linear search
428        Logger::info("Computing ground truth... \n");
429        gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
430        StartStopTimer t;
431        t.start();
432        compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
433        t.stop();
434
435        CostData linear_cost;
436        linear_cost.searchTimeCost = (float)t.value;
437        linear_cost.buildTimeCost = 0;
438        linear_cost.memoryCost = 0;
439        linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;
440
441        costs.push_back(linear_cost);
442
443        // Start parameter autotune process
444        Logger::info("Autotuning parameters...\n");
445
446        optimizeKMeans(costs);
447        optimizeKDTree(costs);
448
449        float bestTimeCost = costs[0].searchTimeCost;
450        for (size_t i = 0; i < costs.size(); ++i) {
451            float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
452            if (timeCost < bestTimeCost) {
453                bestTimeCost = timeCost;
454            }
455        }
456
457        float bestCost = costs[0].searchTimeCost / bestTimeCost;
458        IndexParams bestParams = costs[0].params;
459        if (bestTimeCost > 0) {
460            for (size_t i = 0; i < costs.size(); ++i) {
461                float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
462                                memory_weight_ * costs[i].memoryCost;
463                if (crtCost < bestCost) {
464                    bestCost = crtCost;
465                    bestParams = costs[i].params;
466                }
467            }
468        }
469
470        delete[] gt_matches_.ptr();
471        delete[] testDataset_.ptr();
472        delete[] sampledDataset_.ptr();
473
474        return bestParams;
475    }
476
477
478
479    /**
480     *  Estimates the search time parameters needed to get the desired precision.
481     *  Precondition: the index is built
482     *  Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search.
483     */
484    float estimateSearchParams(SearchParams& searchParams)
485    {
486        const int nn = 1;
487        const size_t SAMPLE_COUNT = 1000;
488
489        assert(bestIndex_ != NULL); // must have a valid index
490
491        float speedup = 0;
492
493        int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
494        if (samples > 0) {
495            Matrix<ElementType> testDataset = random_sample(dataset_, samples);
496
497            Logger::info("Computing ground truth\n");
498
499            // we need to compute the ground truth first
500            Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
501            StartStopTimer t;
502            t.start();
503            compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
504            t.stop();
505            float linear = (float)t.value;
506
507            int checks;
508            Logger::info("Estimating number of checks\n");
509
510            float searchTime;
511            float cb_index;
512            if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
513                Logger::info("KMeans algorithm, estimating cluster border factor\n");
514                KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
515                float bestSearchTime = -1;
516                float best_cb_index = -1;
517                int best_checks = -1;
518                for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
519                    kmeans->set_cb_index(cb_index);
520                    searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
521                    if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
522                        bestSearchTime = searchTime;
523                        best_cb_index = cb_index;
524                        best_checks = checks;
525                    }
526                }
527                searchTime = bestSearchTime;
528                cb_index = best_cb_index;
529                checks = best_checks;
530
531                kmeans->set_cb_index(best_cb_index);
532                Logger::info("Optimum cb_index: %g\n", cb_index);
533                bestParams_["cb_index"] = cb_index;
534            }
535            else {
536                searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
537            }
538
539            Logger::info("Required number of checks: %d \n", checks);
540            searchParams.checks = checks;
541
542            speedup = linear / searchTime;
543
544            delete[] gt_matches.ptr();
545            delete[] testDataset.ptr();
546        }
547
548        return speedup;
549    }
550
551private:
552    NNIndex<Distance>* bestIndex_;
553
554    IndexParams bestParams_;
555    SearchParams bestSearchParams_;
556
557    Matrix<ElementType> sampledDataset_;
558    Matrix<ElementType> testDataset_;
559    Matrix<int> gt_matches_;
560
561    float speedup_;
562
563    /**
564     * The dataset used by this index
565     */
566    const Matrix<ElementType> dataset_;
567
568    /**
569     * Index parameters
570     */
571    float target_precision_;
572    float build_weight_;
573    float memory_weight_;
574    float sample_fraction_;
575
576    Distance distance_;
577
578
579};
580}
581
582#endif /* FLANN_AUTOTUNED_INDEX_H_ */
Note: See TracBrowser for help on using the repository browser.