From 8fecbed410b91d0904cb6ea83568bd18fa741320 Mon Sep 17 00:00:00 2001
From: Mintpasokon <2518558815@qq.com>
Date: Fri, 31 Mar 2023 17:40:55 +0800
Subject: [PATCH] Add concurrent tree build support

---
 CHANGELOG.md                            |   1 +
 README.md                               |   4 +
 examples/KDTreeVectorOfVectorsAdaptor.h |   4 +-
 include/nanoflann.hpp                   | 172 +++++++++++++++++++++++-
 tests/test_main.cpp                     | 118 ++++++++++++++++
 5 files changed, 290 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4a8e7c2..86d92299 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ nanoflann 1.5.0: UNRELEASED
      ctor parameters.
    - Added method RadiusResultSet::empty()
    - Template argument rename: `AccesorType` => `IndexType` (does not actually affect user code at all).
+   - Added concurrent tree building support, refer to `KDTreeSingleIndexAdaptorParams::n_thread_build`.
  * **Other changes:**
    - Macros to avoid conflicts with X11 symbols.
    - Inline an auxiliary example function in case users want to use it and
diff --git a/README.md b/README.md
index 442d8fcf..a30bb5fe 100644
--- a/README.md
+++ b/README.md
@@ -189,6 +189,10 @@ So, it seems that a `leaf_max_size` **between 10 and 50** would be optimum in ap
 
 This parameter is really ignored in `nanoflann`, but was kept for backward compatibility with the original FLANN interface. Just ignore it.
 
+### 2.3. `KDTreeSingleIndexAdaptorParams::n_thread_build`
+
+This parameter determines the maximum number of threads that can be called concurrently during the construction of the KD tree. The default value is 1. When the parameter is set to 0, `nanoflann` automatically determines the number of threads to use.
+
 -----
 
 ## 3. Performance
diff --git a/examples/KDTreeVectorOfVectorsAdaptor.h b/examples/KDTreeVectorOfVectorsAdaptor.h
index df08c88d..50d16f96 100644
--- a/examples/KDTreeVectorOfVectorsAdaptor.h
+++ b/examples/KDTreeVectorOfVectorsAdaptor.h
@@ -69,7 +69,7 @@ struct KDTreeVectorOfVectorsAdaptor
     /// data points
     KDTreeVectorOfVectorsAdaptor(
         const size_t /* dimensionality */, const VectorOfVectorsType& mat,
-        const int leaf_max_size = 10)
+        const int leaf_max_size = 10, const unsigned int n_thread_build = 1)
         : m_data(mat)
     {
         assert(mat.size() != 0 && mat[0].size() != 0);
@@ -80,7 +80,7 @@ struct KDTreeVectorOfVectorsAdaptor
                 "argument");
         index = new index_t(
             static_cast<int>(dims), *this /* adaptor */,
-            nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size));
+            nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size, KDTreeSingleIndexAdaptorFlags::None, n_thread_build));
     }
 
     ~KDTreeVectorOfVectorsAdaptor() { delete index; }
diff --git a/include/nanoflann.hpp b/include/nanoflann.hpp
index 35b9c14a..e5d86487 100644
--- a/include/nanoflann.hpp
+++ b/include/nanoflann.hpp
@@ -46,10 +46,12 @@
 
 #include <algorithm>
 #include <array>
+#include <atomic>
 #include <cassert>
 #include <cmath>  // for abs()
 #include <cstdlib>  // for abs()
 #include <functional>  // std::reference_wrapper
+#include <future>
 #include <istream>
 #include <limits>  // std::numeric_limits
 #include <ostream>
@@ -692,14 +694,19 @@ inline std::underlying_type<KDTreeSingleIndexAdaptorFlags>::type operator&(
 struct KDTreeSingleIndexAdaptorParams
 {
     KDTreeSingleIndexAdaptorParams(
-        size_t _leaf_max_size = 10, KDTreeSingleIndexAdaptorFlags _flags =
-                                        KDTreeSingleIndexAdaptorFlags::None)
-        : leaf_max_size(_leaf_max_size), flags(_flags)
+        size_t                        _leaf_max_size = 10,
+        KDTreeSingleIndexAdaptorFlags _flags =
+            KDTreeSingleIndexAdaptorFlags::None,
+        unsigned int _n_thread_build = 1)
+        : leaf_max_size(_leaf_max_size),
+          flags(_flags),
+          n_thread_build(_n_thread_build)
     {
     }
 
     size_t                        leaf_max_size;
     KDTreeSingleIndexAdaptorFlags flags;
+    unsigned int                  n_thread_build;
 };
 
 /** Search options for KDTreeSingleIndexAdaptor::findNeighbors() */
@@ -954,6 +961,8 @@ class KDTreeBaseClass
 
     Size leaf_max_size_ = 0;
 
+    /// Number of thread for concurrent tree build
+    Size n_thread_build_ = 1;
     /// Number of current points in the dataset
     Size size_ = 0;
     /// Number of points in the dataset when the index was built
@@ -1084,6 +1093,117 @@ class KDTreeBaseClass
         return node;
     }
 
+    /**
+     * Create a tree node that subdivides the list of vecs from vind[first] to
+     * vind[last] concurrently.  The routine is called recursively on each
+     * sublist.
+     *
+     * @param left index of the first vector
+     * @param right index of the last vector
+     * @param thread_count count of std::async threads
+     * @param mutex mutex for mempool allocation
+     */
+    NodePtr divideTreeConcurrent(
+        Derived& obj, const Offset left, const Offset right, BoundingBox& bbox,
+        std::atomic<unsigned int>& thread_count, std::mutex& mutex)
+    {
+        std::unique_lock<std::mutex> lock(mutex);
+        NodePtr node = obj.pool_.template allocate<Node>();  // allocate memory
+        lock.unlock();
+
+        const auto dims = (DIM > 0 ? DIM : obj.dim_);
+
+        /* If too few exemplars remain, then make this a leaf node. */
+        if ((right - left) <= static_cast<Offset>(obj.leaf_max_size_))
+        {
+            node->child1 = node->child2 = nullptr; /* Mark as leaf node. */
+            node->node_type.lr.left     = left;
+            node->node_type.lr.right    = right;
+
+            // compute bounding-box of leaf points
+            for (Dimension i = 0; i < dims; ++i)
+            {
+                bbox[i].low  = dataset_get(obj, obj.vAcc_[left], i);
+                bbox[i].high = dataset_get(obj, obj.vAcc_[left], i);
+            }
+            for (Offset k = left + 1; k < right; ++k)
+            {
+                for (Dimension i = 0; i < dims; ++i)
+                {
+                    const auto val = dataset_get(obj, obj.vAcc_[k], i);
+                    if (bbox[i].low > val) bbox[i].low = val;
+                    if (bbox[i].high < val) bbox[i].high = val;
+                }
+            }
+        }
+        else
+        {
+            Offset       idx;
+            Dimension    cutfeat;
+            DistanceType cutval;
+            middleSplit_(obj, left, right - left, idx, cutfeat, cutval, bbox);
+
+            node->node_type.sub.divfeat = cutfeat;
+
+            std::future<NodePtr> left_future, right_future;
+
+            BoundingBox left_bbox(bbox);
+            left_bbox[cutfeat].high = cutval;
+            if (++thread_count < n_thread_build_)
+            {
+                left_future = std::async(
+                    std::launch::async, &KDTreeBaseClass::divideTreeConcurrent,
+                    this, std::ref(obj), left, left + idx, std::ref(left_bbox),
+                    std::ref(thread_count), std::ref(mutex));
+            }
+            else
+            {
+                --thread_count;
+                node->child1 = this->divideTreeConcurrent(
+                    obj, left, left + idx, left_bbox, thread_count, mutex);
+            }
+
+            BoundingBox right_bbox(bbox);
+            right_bbox[cutfeat].low = cutval;
+            if (++thread_count < n_thread_build_)
+            {
+                right_future = std::async(
+                    std::launch::async, &KDTreeBaseClass::divideTreeConcurrent,
+                    this, std::ref(obj), left + idx, right,
+                    std::ref(right_bbox), std::ref(thread_count),
+                    std::ref(mutex));
+            }
+            else
+            {
+                --thread_count;
+                node->child2 = this->divideTreeConcurrent(
+                    obj, left + idx, right, right_bbox, thread_count, mutex);
+            }
+
+            if (left_future.valid())
+            {
+                node->child1 = left_future.get();
+                --thread_count;
+            }
+            if (right_future.valid())
+            {
+                node->child2 = right_future.get();
+                --thread_count;
+            }
+
+            node->node_type.sub.divlow  = left_bbox[cutfeat].high;
+            node->node_type.sub.divhigh = right_bbox[cutfeat].low;
+
+            for (Dimension i = 0; i < dims; ++i)
+            {
+                bbox[i].low  = std::min(left_bbox[i].low, right_bbox[i].low);
+                bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high);
+            }
+        }
+
+        return node;
+    }
+
     void middleSplit_(
         const Derived& obj, const Offset ind, const Size count, Offset& index,
         Dimension& cutfeat, DistanceType& cutval, const BoundingBox& bbox)
@@ -1397,6 +1517,15 @@ class KDTreeSingleIndexAdaptor
         Base::dim_                 = dimensionality;
         if (DIM > 0) Base::dim_ = DIM;
         Base::leaf_max_size_ = params.leaf_max_size;
+        if (params.n_thread_build > 0)
+        {
+            Base::n_thread_build_ = params.n_thread_build;
+        }
+        else
+        {
+            Base::n_thread_build_ =
+                std::max(std::thread::hardware_concurrency(), 1u);
+        }
 
         if (!(params.flags &
               KDTreeSingleIndexAdaptorFlags::SkipInitialBuildIndex))
@@ -1420,8 +1549,18 @@ class KDTreeSingleIndexAdaptor
         if (Base::size_ == 0) return;
         computeBoundingBox(Base::root_bbox_);
         // construct the tree
-        Base::root_node_ =
-            this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
+        if (Base::n_thread_build_ == 1)
+        {
+            Base::root_node_ =
+                this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
+        }
+        else
+        {
+            std::atomic<unsigned int> thread_count = 0;
+            std::mutex                mutex;
+            Base::root_node_ = this->divideTreeConcurrent(
+                *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex);
+        }
     }
 
     /** \name Query methods
@@ -1803,6 +1942,15 @@ class KDTreeSingleIndexDynamicAdaptor_
         Base::dim_ = dimensionality;
         if (DIM > 0) Base::dim_ = DIM;
         Base::leaf_max_size_ = params.leaf_max_size;
+        if (params.n_thread_build > 0)
+        {
+            Base::n_thread_build_ = params.n_thread_build;
+        }
+        else
+        {
+            Base::n_thread_build_ =
+                std::max(std::thread::hardware_concurrency(), 1u);
+        }
     }
 
     /** Explicitly default the copy constructor */
@@ -1837,8 +1985,18 @@ class KDTreeSingleIndexDynamicAdaptor_
         if (Base::size_ == 0) return;
         computeBoundingBox(Base::root_bbox_);
         // construct the tree
-        Base::root_node_ =
-            this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
+        if (Base::n_thread_build_ == 1)
+        {
+            Base::root_node_ =
+                this->divideTree(*this, 0, Base::size_, Base::root_bbox_);
+        }
+        else
+        {
+            std::atomic<unsigned int> thread_count = 0;
+            std::mutex                mutex;
+            Base::root_node_ = this->divideTreeConcurrent(
+                *this, 0, Base::size_, Base::root_bbox_, thread_count, mutex);
+        }
     }
 
     /** \name Query methods
diff --git a/tests/test_main.cpp b/tests/test_main.cpp
index bebad6e6..8410fcfd 100644
--- a/tests/test_main.cpp
+++ b/tests/test_main.cpp
@@ -384,6 +384,94 @@ void L2_dynamic_vs_bruteforce_test(const size_t nSamples)
     }
 }
 
+template <typename NUM>
+void L2_concurrent_build_vs_bruteforce_test(const size_t nSamples, 
+    const size_t DIM)
+{
+    std::vector<std::vector<NUM>> samples;
+
+    const NUM max_range = NUM(20.0);
+
+    // Generate points:
+    generateRandomPointCloud(samples, nSamples, DIM, max_range);
+
+    // Query point:
+    std::vector<NUM> query_pt(DIM);
+    for (size_t d = 0; d < DIM; d++)
+        query_pt[d] = static_cast<NUM>(max_range * (rand() % 1000) / (1000.0));
+
+    // construct a kd-tree index:
+    // Dimensionality set at run-time (default: L2)
+    // ------------------------------------------------------------
+    typedef KDTreeVectorOfVectorsAdaptor<std::vector<std::vector<NUM>>, NUM>
+        my_kd_tree_t;
+
+    my_kd_tree_t mat_index(DIM /*dim*/, samples, 10 /* max leaf */, 0 /* concurrent build */);
+
+    // do a knn search
+    const size_t        num_results = 1;
+    std::vector<size_t> ret_indexes(num_results);
+    std::vector<NUM>    out_dists_sqr(num_results);
+
+    nanoflann::KNNResultSet<NUM> resultSet(num_results);
+
+    resultSet.init(&ret_indexes[0], &out_dists_sqr[0]);
+    mat_index.index->findNeighbors(resultSet, &query_pt[0]);
+
+    // Brute force:
+    double min_dist_L2 = std::numeric_limits<double>::max();
+    size_t min_idx     = std::numeric_limits<size_t>::max();
+    {
+        for (size_t i = 0; i < nSamples; i++)
+        {
+            double dist = 0.0;
+            for (size_t d = 0; d < DIM; d++)
+                dist += (query_pt[d] - samples[i][d]) *
+                        (query_pt[d] - samples[i][d]);
+            if (dist < min_dist_L2)
+            {
+                min_dist_L2 = dist;
+                min_idx     = i;
+            }
+        }
+        ASSERT_TRUE(min_idx != std::numeric_limits<size_t>::max());
+    }
+
+    // Compare:
+    EXPECT_EQ(min_idx, ret_indexes[0]);
+    EXPECT_NEAR(min_dist_L2, out_dists_sqr[0], 1e-3);
+}
+
+template <typename NUM>
+void L2_concurrent_build_vs_L2_test(const size_t nSamples, 
+    const size_t DIM)
+{
+    std::vector<std::vector<NUM>> samples;
+
+    const NUM max_range = NUM(20.0);
+
+    // Generate points:
+    generateRandomPointCloud(samples, nSamples, DIM, max_range);
+
+    // Query point:
+    std::vector<NUM> query_pt(DIM);
+    for (size_t d = 0; d < DIM; d++)
+        query_pt[d] = static_cast<NUM>(max_range * (rand() % 1000) / (1000.0));
+
+    // construct a kd-tree index:
+    // Dimensionality set at run-time (default: L2)
+    // ------------------------------------------------------------
+    typedef KDTreeVectorOfVectorsAdaptor<std::vector<std::vector<NUM>>, NUM>
+        my_kd_tree_t;
+
+    my_kd_tree_t mat_index_concurrent_build(DIM /*dim*/, samples, 
+        10 /* max leaf */, 0 /* concurrent build */);
+    my_kd_tree_t mat_index(DIM /*dim*/, samples, 10 /* max leaf */);
+
+    // Compare:
+    EXPECT_EQ(mat_index.index->vAcc_, mat_index_concurrent_build.index->vAcc_);
+}
+
 TEST(kdtree, L2_vs_L2_simple)
 {
     for (int nResults = 1; nResults < 10; nResults++)
@@ -552,3 +640,33 @@ TEST(kdtree, add_and_remove_points)
     actual = query();
     EXPECT_EQ(actual, static_cast<size_t>(0));
 }
+
+TEST(kdtree, L2_concurrent_build_vs_bruteforce)
+{
+    srand(static_cast<unsigned int>(time(nullptr)));
+    for (int i = 0; i < 10; i++)
+    {
+        L2_concurrent_build_vs_bruteforce_test<float>(100, 2);
+        L2_concurrent_build_vs_bruteforce_test<float>(100, 3);
+        L2_concurrent_build_vs_bruteforce_test<float>(100, 7);
+
+        L2_concurrent_build_vs_bruteforce_test<double>(100, 2);
+        L2_concurrent_build_vs_bruteforce_test<double>(100, 3);
+        L2_concurrent_build_vs_bruteforce_test<double>(100, 7);
+    }
+}
+
+TEST(kdtree, L2_concurrent_build_vs_L2)
+{
+    srand(static_cast<unsigned int>(time(nullptr)));
+    for (int i = 0; i < 10; i++)
+    {
+        L2_concurrent_build_vs_L2_test<float>(100, 2);
+        L2_concurrent_build_vs_L2_test<float>(100, 3);
+        L2_concurrent_build_vs_L2_test<float>(100, 7);
+
+        L2_concurrent_build_vs_L2_test<double>(100, 2);
+        L2_concurrent_build_vs_L2_test<double>(100, 3);
+        L2_concurrent_build_vs_L2_test<double>(100, 7);
+    }
+}