From 8fecbed410b91d0904cb6ea83568bd18fa741320 Mon Sep 17 00:00:00 2001 From: Mintpasokon <2518558815@qq.com> Date: Fri, 31 Mar 2023 17:40:55 +0800 Subject: [PATCH] Add concurrent tree build support --- CHANGELOG.md | 1 + README.md | 4 + examples/KDTreeVectorOfVectorsAdaptor.h | 4 +- include/nanoflann.hpp | 172 +++++++++++++++++++++++- tests/test_main.cpp | 118 ++++++++++++++++ 5 files changed, 290 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4a8e7c2..86d92299 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ nanoflann 1.5.0: UNRELEASED ctor parameters. - Added method RadiusResultSet::empty() - Template argument rename: `AccesorType` => `IndexType` (does not actually affect user code at all). + - Added concurrent tree building support, refer to `KDTreeSingleIndexAdaptorParams::n_thread_build`. * **Other changes:** - Macros to avoid conflicts with X11 symbols. - Inline an auxiliary example function in case users want to use it and diff --git a/README.md b/README.md index 442d8fcf..a30bb5fe 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,10 @@ So, it seems that a `leaf_max_size` **between 10 and 50** would be optimum in ap This parameter is really ignored in `nanoflann`, but was kept for backward compatibility with the original FLANN interface. Just ignore it. +### 2.3. `KDTreeSingleIndexAdaptorParams::n_thread_build` + +This parameter determines the maximum number of threads that can be called concurrently during the construction of the KD tree. The default value is 1. When the parameter is set to 0, `nanoflann` automatically determines the number of threads to use. + ----- ## 3. Performance diff --git a/examples/KDTreeVectorOfVectorsAdaptor.h b/examples/KDTreeVectorOfVectorsAdaptor.h index df08c88d..50d16f96 100644 --- a/examples/KDTreeVectorOfVectorsAdaptor.h +++ b/examples/KDTreeVectorOfVectorsAdaptor.h @@ -69,7 +69,7 @@ struct KDTreeVectorOfVectorsAdaptor /// data points KDTreeVectorOfVectorsAdaptor( const size_t /* dimensionality */, const VectorOfVectorsType& mat, - const int leaf_max_size = 10) + const int leaf_max_size = 10, const unsigned int n_thread_build = 1) : m_data(mat) { assert(mat.size() != 0 && mat[0].size() != 0); @@ -80,7 +80,7 @@ struct KDTreeVectorOfVectorsAdaptor "argument"); index = new index_t( static_cast(dims), *this /* adaptor */, - nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size)); + nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size, KDTreeSingleIndexAdaptorFlags::None, n_thread_build)); } ~KDTreeVectorOfVectorsAdaptor() { delete index; } diff --git a/include/nanoflann.hpp b/include/nanoflann.hpp index 35b9c14a..e5d86487 100644 --- a/include/nanoflann.hpp +++ b/include/nanoflann.hpp @@ -46,10 +46,12 @@ #include #include +#include #include #include // for abs() #include // for abs() #include // std::reference_wrapper +#include #include #include // std::numeric_limits #include @@ -692,14 +694,19 @@ inline std::underlying_type::type operator&( struct KDTreeSingleIndexAdaptorParams { KDTreeSingleIndexAdaptorParams( - size_t _leaf_max_size = 10, KDTreeSingleIndexAdaptorFlags _flags = - KDTreeSingleIndexAdaptorFlags::None) - : leaf_max_size(_leaf_max_size), flags(_flags) + size_t _leaf_max_size = 10, + KDTreeSingleIndexAdaptorFlags _flags = + KDTreeSingleIndexAdaptorFlags::None, + unsigned int _n_thread_build = 1) + : leaf_max_size(_leaf_max_size), + flags(_flags), + n_thread_build(_n_thread_build) { } size_t leaf_max_size; KDTreeSingleIndexAdaptorFlags flags; + unsigned int n_thread_build; }; /** Search options for KDTreeSingleIndexAdaptor::findNeighbors() */ @@ -954,6 +961,8 @@ class KDTreeBaseClass Size leaf_max_size_ = 0; + /// Number of thread for concurrent tree build + Size n_thread_build_ = 1; /// Number of current points in the dataset Size size_ = 0; /// Number of points in the dataset when the index was built @@ -1084,6 +1093,117 @@ class KDTreeBaseClass return node; } + /** + * Create a tree node that subdivides the list of vecs from vind[first] to + * vind[last] concurrently. The routine is called recursively on each + * sublist. + * + * @param left index of the first vector + * @param right index of the last vector + * @param thread_count count of std::async threads + * @param mutex mutex for mempool allocation + */ + NodePtr divideTreeConcurrent( + Derived& obj, const Offset left, const Offset right, BoundingBox& bbox, + std::atomic& thread_count, std::mutex& mutex) + { + std::unique_lock lock(mutex); + NodePtr node = obj.pool_.template allocate(); // allocate memory + lock.unlock(); + + const auto dims = (DIM > 0 ? DIM : obj.dim_); + + /* If too few exemplars remain, then make this a leaf node. */ + if ((right - left) <= static_cast(obj.leaf_max_size_)) + { + node->child1 = node->child2 = nullptr; /* Mark as leaf node. */ + node->node_type.lr.left = left; + node->node_type.lr.right = right; + + // compute bounding-box of leaf points + for (Dimension i = 0; i < dims; ++i) + { + bbox[i].low = dataset_get(obj, obj.vAcc_[left], i); + bbox[i].high = dataset_get(obj, obj.vAcc_[left], i); + } + for (Offset k = left + 1; k < right; ++k) + { + for (Dimension i = 0; i < dims; ++i) + { + const auto val = dataset_get(obj, obj.vAcc_[k], i); + if (bbox[i].low > val) bbox[i].low = val; + if (bbox[i].high < val) bbox[i].high = val; + } + } + } + else + { + Offset idx; + Dimension cutfeat; + DistanceType cutval; + middleSplit_(obj, left, right - left, idx, cutfeat, cutval, bbox); + + node->node_type.sub.divfeat = cutfeat; + + std::future left_future, right_future; + + BoundingBox left_bbox(bbox); + left_bbox[cutfeat].high = cutval; + if (++thread_count < n_thread_build_) + { + left_future = std::async( + std::launch::async, &KDTreeBaseClass::divideTreeConcurrent, + this, std::ref(obj), left, left + idx, std::ref(left_bbox), + std::ref(thread_count), std::ref(mutex)); + } + else + { + --thread_count; + node->child1 = this->divideTreeConcurrent( + obj, left, left + idx, left_bbox, thread_count, mutex); + } + + BoundingBox right_bbox(bbox); + right_bbox[cutfeat].low = cutval; + if (++thread_count < n_thread_build_) + { + right_future = std::async( + std::launch::async, &KDTreeBaseClass::divideTreeConcurrent, + this, std::ref(obj), left + idx, right, + std::ref(right_bbox), std::ref(thread_count), + std::ref(mutex)); + } + else + { + --thread_count; + node->child2 = this->divideTreeConcurrent( + obj, left + idx, right, right_bbox, thread_count, mutex); + } + + if (left_future.valid()) + { + node->child1 = left_future.get(); + --thread_count; + } + if (right_future.valid()) + { + node->child2 = right_future.get(); + --thread_count; + } + + node->node_type.sub.divlow = left_bbox[cutfeat].high; + node->node_type.sub.divhigh = right_bbox[cutfeat].low; + + for (Dimension i = 0; i < dims; ++i) + { + bbox[i].low = std::min(left_bbox[i].low, right_bbox[i].low); + bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high); + } + } + + return node; + } + void middleSplit_( const Derived& obj, const Offset ind, const Size count, Offset& index, Dimension& cutfeat, DistanceType& cutval, const BoundingBox& bbox) @@ -1397,6 +1517,15 @@ class KDTreeSingleIndexAdaptor Base::dim_ = dimensionality; if (DIM > 0) Base::dim_ = DIM; Base::leaf_max_size_ = params.leaf_max_size; + if (params.n_thread_build > 0) + { + Base::n_thread_build_ = params.n_thread_build; + } + else + { + Base::n_thread_build_ = + std::max(std::thread::hardware_concurrency(), 1u); + } if (!(params.flags & KDTreeSingleIndexAdaptorFlags::SkipInitialBuildIndex)) @@ -1420,8 +1549,18 @@ class KDTreeSingleIndexAdaptor if (Base::size_ == 0) return; computeBoundingBox(Base::root_bbox_); // construct the tree - Base::root_node_ = - this->divideTree(*this, 0, Base::size_, Base::root_bbox_); + if (Base::n_thread_build_ == 1) + { + Base::root_node_ = + this->divideTree(*this, 0, Base::size_, Base::root_bbox_); + } + else + { + std::atomic thread_count = 0; + std::mutex mutex; + Base::root_node_ = this->divideTreeConcurrent( + *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex); + } } /** \name Query methods @@ -1803,6 +1942,15 @@ class KDTreeSingleIndexDynamicAdaptor_ Base::dim_ = dimensionality; if (DIM > 0) Base::dim_ = DIM; Base::leaf_max_size_ = params.leaf_max_size; + if (params.n_thread_build > 0) + { + Base::n_thread_build_ = params.n_thread_build; + } + else + { + Base::n_thread_build_ = + std::max(std::thread::hardware_concurrency(), 1u); + } } /** Explicitly default the copy constructor */ @@ -1837,8 +1985,18 @@ class KDTreeSingleIndexDynamicAdaptor_ if (Base::size_ == 0) return; computeBoundingBox(Base::root_bbox_); // construct the tree - Base::root_node_ = - this->divideTree(*this, 0, Base::size_, Base::root_bbox_); + if (Base::n_thread_build_ == 1) + { + Base::root_node_ = + this->divideTree(*this, 0, Base::size_, Base::root_bbox_); + } + else + { + std::atomic thread_count = 0; + std::mutex mutex; + Base::root_node_ = this->divideTreeConcurrent( + *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex); + } } /** \name Query methods diff --git a/tests/test_main.cpp b/tests/test_main.cpp index bebad6e6..8410fcfd 100644 --- a/tests/test_main.cpp +++ b/tests/test_main.cpp @@ -384,6 +384,94 @@ void L2_dynamic_vs_bruteforce_test(const size_t nSamples) } } +template +void L2_concurrent_build_vs_bruteforce_test(const size_t nSamples, + const size_t DIM) +{ + std::vector> samples; + + const NUM max_range = NUM(20.0); + + // Generate points: + generateRandomPointCloud(samples, nSamples, DIM, max_range); + + // Query point: + std::vector query_pt(DIM); + for (size_t d = 0; d < DIM; d++) + query_pt[d] = static_cast(max_range * (rand() % 1000) / (1000.0)); + + // construct a kd-tree index: + // Dimensionality set at run-time (default: L2) + // ------------------------------------------------------------ + typedef KDTreeVectorOfVectorsAdaptor>, NUM> + my_kd_tree_t; + + my_kd_tree_t mat_index(DIM /*dim*/, samples, 10 /* max leaf */, 0 /* concurrent build */); + + // do a knn search + const size_t num_results = 1; + std::vector ret_indexes(num_results); + std::vector out_dists_sqr(num_results); + + nanoflann::KNNResultSet resultSet(num_results); + + resultSet.init(&ret_indexes[0], &out_dists_sqr[0]); + mat_index.index->findNeighbors(resultSet, &query_pt[0]); + + // Brute force: + double min_dist_L2 = std::numeric_limits::max(); + size_t min_idx = std::numeric_limits::max(); + { + for (size_t i = 0; i < nSamples; i++) + { + double dist = 0.0; + for (size_t d = 0; d < DIM; d++) + dist += (query_pt[d] - samples[i][d]) * + (query_pt[d] - samples[i][d]); + if (dist < min_dist_L2) + { + min_dist_L2 = dist; + min_idx = i; + } + } + ASSERT_TRUE(min_idx != std::numeric_limits::max()); + } + + // Compare: + EXPECT_EQ(min_idx, ret_indexes[0]); + EXPECT_NEAR(min_dist_L2, out_dists_sqr[0], 1e-3); +} + +template +void L2_concurrent_build_vs_L2_test(const size_t nSamples, + const size_t DIM) +{ + std::vector> samples; + + const NUM max_range = NUM(20.0); + + // Generate points: + generateRandomPointCloud(samples, nSamples, DIM, max_range); + + // Query point: + std::vector query_pt(DIM); + for (size_t d = 0; d < DIM; d++) + query_pt[d] = static_cast(max_range * (rand() % 1000) / (1000.0)); + + // construct a kd-tree index: + // Dimensionality set at run-time (default: L2) + // ------------------------------------------------------------ + typedef KDTreeVectorOfVectorsAdaptor>, NUM> + my_kd_tree_t; + + my_kd_tree_t mat_index_concurrent_build(DIM /*dim*/, samples, + 10 /* max leaf */, 0 /* concurrent build */); + my_kd_tree_t mat_index(DIM /*dim*/, samples, 10 /* max leaf */); + + // Compare: + EXPECT_EQ(mat_index.index->vAcc_, mat_index_concurrent_build.index->vAcc_); +} + TEST(kdtree, L2_vs_L2_simple) { for (int nResults = 1; nResults < 10; nResults++) @@ -552,3 +640,33 @@ TEST(kdtree, add_and_remove_points) actual = query(); EXPECT_EQ(actual, static_cast(0)); } + +TEST(kdtree, L2_concurrent_build_vs_bruteforce) +{ + srand(static_cast(time(nullptr))); + for (int i = 0; i < 10; i++) + { + L2_concurrent_build_vs_bruteforce_test(100, 2); + L2_concurrent_build_vs_bruteforce_test(100, 3); + L2_concurrent_build_vs_bruteforce_test(100, 7); + + L2_concurrent_build_vs_bruteforce_test(100, 2); + L2_concurrent_build_vs_bruteforce_test(100, 3); + L2_concurrent_build_vs_bruteforce_test(100, 7); + } +} + +TEST(kdtree, L2_concurrent_build_vs_L2) +{ + srand(static_cast(time(nullptr))); + for (int i = 0; i < 10; i++) + { + L2_concurrent_build_vs_L2_test(100, 2); + L2_concurrent_build_vs_L2_test(100, 3); + L2_concurrent_build_vs_L2_test(100, 7); + + L2_concurrent_build_vs_L2_test(100, 2); + L2_concurrent_build_vs_L2_test(100, 3); + L2_concurrent_build_vs_L2_test(100, 7); + } +}