From 35287df0f3ba4cf58a277f33feb2b5cab1d834e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 15 Apr 2026 19:54:50 -0700 Subject: [PATCH 01/35] Create nvforest::handle_t --- cpp/include/nvforest/forest_model.hpp | 14 +++---- cpp/include/nvforest/handle.hpp | 60 +++++++++++++++++++++++++++ cpp/tests/treelite_importer.cpp | 5 ++- 3 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 cpp/include/nvforest/handle.hpp diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 2b1e6e5..eecda34 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -152,8 +152,8 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t - * on GPU) which will be used to provide streams for evaluation. + * @param[in] handle The nvforest::handle_t which will be used to provide + * streams for evaluation. * @param[out] output The buffer where model output should be stored. If * this buffer is on host while the model is on device or vice versa, * work will be distributed across available streams to copy the data back @@ -177,7 +177,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft_proto::handle_t const& handle, + void predict(handle_t const& handle, raft_proto::buffer& output, raft_proto::buffer const& input, infer_kind predict_type = infer_kind::default_kind, @@ -252,8 +252,8 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t - * on GPU) which will be used to provide streams for evaluation. + * @param[in] handle The nvforest::handle_t which will be used to provide + * streams for evaluation. 
* @param[out] output Pointer to the memory location where output should end * up * @param[in] input Pointer to the input data @@ -276,7 +276,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft_proto::handle_t const& handle, + void predict(handle_t const& handle, io_t* output, io_t* input, std::size_t num_rows, diff --git a/cpp/include/nvforest/handle.hpp b/cpp/include/nvforest/handle.hpp new file mode 100644 index 0000000..5ea4f3b --- /dev/null +++ b/cpp/include/nvforest/handle.hpp @@ -0,0 +1,60 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include + +#ifdef NVFOREST_ENABLE_GPU +#include + +#include +#endif + +namespace nvforest { + +#ifdef NVFOREST_ENABLE_GPU +/** + * A thin wrapper around raft_proto::handle_t that owns the underlying raft::handle_t. + * + * Default construction automatically creates both a raft::handle_t and the + * raft_proto::handle_t that references it, so callers do not need to manage + * RAFT handles directly. 
+ */ +struct handle_t { + /** Default constructor: creates and owns a raft::handle_t and wraps it */ + handle_t() + : owned_raft_handle_{std::make_unique()}, + raft_proto_handle_{*owned_raft_handle_} + { + } + + /** Wrap an externally-owned raft::handle_t without taking ownership */ + handle_t(raft::handle_t const& raft_handle) : raft_proto_handle_{raft_handle} {} + + auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); } + auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); } + auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); } + void synchronize() const { raft_proto_handle_.synchronize(); } + + private: + // Null when wrapping an external raft::handle_t + std::unique_ptr owned_raft_handle_; + raft_proto::handle_t raft_proto_handle_; +}; +#else +/** + * CPU-only handle: thin wrapper around the no-op raft_proto::handle_t. + */ +struct handle_t { + auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); } + auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); } + auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); } + void synchronize() const { raft_proto_handle_.synchronize(); } + + private: + raft_proto::handle_t raft_proto_handle_; +}; +#endif + +} // namespace nvforest diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index 8f946e4..fcd8ad2 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -337,7 +338,7 @@ TEST(TreeliteImporter, DegenerateTree) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_FALSE(fil_model.has_vector_leaves()); - auto handle = raft::handle_t{}; + auto handle = nvforest::handle_t{}; auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); auto 
expected_preds = std::vector{1.0}; @@ -358,7 +359,7 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_TRUE(fil_model.has_vector_leaves()); - auto handle = raft::handle_t{}; + auto handle = nvforest::handle_t{}; auto X = std::vector{0.0}; auto preds = std::vector(2, 0.0); auto expected_preds = std::vector{0.5, 0.5}; From 551bbd23da2f24b7b7097bd3a0bbbe5580ac8823 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 15 Apr 2026 20:00:56 -0700 Subject: [PATCH 02/35] Update doc --- docs/source/getting_started.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 8500861..2522f7d 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -195,11 +195,9 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. code-block:: cpp - #include - #include + #include - raft::handle_t raft_handle{}; - raft_proto::handle_t handle{raft_handle}; + auto handle = nvforest::handle_t{}; // Assumption: // * Both output and input are in the GPU memory. 
From d8ad61bf2339e78196f00cb168244c52313d6967 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 15 Apr 2026 20:45:27 -0700 Subject: [PATCH 03/35] Define nvForest handle in Python layer --- python/nvforest/CMakeLists.txt | 6 ++++ python/nvforest/nvforest/_handle.py | 5 ++-- .../nvforest/detail/forest_inference.pyx | 28 ++++++++----------- .../detail/{raft_proto => }/handle.pxd | 9 +++--- python/nvforest/nvforest/detail/handle.pyx | 17 +++++++++++ 5 files changed, 42 insertions(+), 23 deletions(-) rename python/nvforest/nvforest/detail/{raft_proto => }/handle.pxd (58%) create mode 100644 python/nvforest/nvforest/detail/handle.pyx diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt index fd548e5..dfe234d 100644 --- a/python/nvforest/CMakeLists.txt +++ b/python/nvforest/CMakeLists.txt @@ -72,6 +72,12 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) +rapids_cython_create_modules( + CXX + SOURCE_FILES "nvforest/detail/handle.pyx" + LINKED_LIBRARIES "${linked_libraries}" + INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) + rapids_cython_create_modules( CXX SOURCE_FILES "nvforest/detail/treelite.pyx" diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py index 347b748..8b68338 100644 --- a/python/nvforest/nvforest/_handle.py +++ b/python/nvforest/nvforest/_handle.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 # -from pylibraft.common.handle import Handle as RaftHandle +from nvforest.detail.handle import Handle as _Handle -# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.Handle -Handle = RaftHandle +Handle = _Handle diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index 0340c59..dec4e22 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -12,10 +12,11 @@ 
from nvforest._handle import Handle from nvforest._typing import DataType from nvforest.detail.treelite import safe_treelite_call +from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool -from pylibraft.common.handle cimport handle_t as raft_handle_t +from nvforest.detail.handle cimport handle_t from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op from nvforest.detail.raft_proto.cuda_stream cimport ( @@ -24,7 +25,6 @@ from nvforest.detail.raft_proto.cuda_stream cimport ( from nvforest.detail.raft_proto.device_type cimport ( device_type as raft_proto_device_t, ) -from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t from nvforest.detail.raft_proto.optional cimport nullopt, optional from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout from nvforest.detail.treelite cimport ( @@ -37,7 +37,7 @@ from nvforest.detail.treelite cimport ( cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( - const raft_proto_handle_t&, + const handle_t&, io_t*, io_t*, size_t, @@ -69,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: cdef class ForestInference_impl(): cdef forest_model model - cdef raft_proto_handle_t raft_proto_handle - cdef object raft_handle + cdef object py_handle + cdef handle_t* c_handle cdef object device def __cinit__( self, - raft_handle: object, + handle: object, tl_model_bytes: Union[bytes, bytearray], *, layout: str = "depth_first", @@ -84,12 +84,8 @@ cdef class ForestInference_impl(): device: str = "cpu", device_id: Optional[int] = None, ): - # Store reference to RAFT handle to control lifetime, since raft_proto - # handle keeps a pointer to it - self.raft_handle = raft_handle - self.raft_proto_handle = raft_proto_handle_t( - self.raft_handle.getHandle() - ) + self.py_handle = handle + 
self.c_handle = self.py_handle.getHandle() cdef optional[bool] use_double_precision_c cdef bool use_double_precision_bool @@ -134,7 +130,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.raft_proto_handle.get_next_usable_stream() + self.c_handle.get_next_usable_stream() ) safe_treelite_call( @@ -244,7 +240,7 @@ cdef class ForestInference_impl(): if model_dtype == np.float32: self.model.predict[float]( - self.raft_proto_handle, + deref(self.c_handle), out_ptr, in_ptr, n_rows, @@ -255,7 +251,7 @@ cdef class ForestInference_impl(): ) else: self.model.predict[double]( - self.raft_proto_handle, + deref(self.c_handle), out_ptr, in_ptr, n_rows, @@ -266,7 +262,7 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.raft_proto_handle.synchronize() + self.c_handle.synchronize() return preds diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd similarity index 58% rename from python/nvforest/nvforest/detail/raft_proto/handle.pxd rename to python/nvforest/nvforest/detail/handle.pxd index 5ad107e..033caf3 100644 --- a/python/nvforest/nvforest/detail/raft_proto/handle.pxd +++ b/python/nvforest/nvforest/detail/handle.pxd @@ -3,17 +3,18 @@ # SPDX-License-Identifier: Apache-2.0 # -from pylibraft.common.handle cimport handle_t as raft_handle_t +from libcpp.memory cimport unique_ptr from nvforest.detail.raft_proto.cuda_stream cimport ( cuda_stream as raft_proto_stream_t, ) -cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil: +cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil: cdef cppclass handle_t: handle_t() except + - handle_t(const raft_handle_t* handle_ptr) except + - handle_t(const raft_handle_t& handle) except + raft_proto_stream_t get_next_usable_stream() except + void synchronize() except+ + +cdef class Handle: + cdef unique_ptr[handle_t] c_obj diff --git a/python/nvforest/nvforest/detail/handle.pyx 
b/python/nvforest/nvforest/detail/handle.pyx new file mode 100644 index 0000000..e4e9ae1 --- /dev/null +++ b/python/nvforest/nvforest/detail/handle.pyx @@ -0,0 +1,17 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +cdef class Handle: + def __cinit__(self, c_obj=None): + self.c_obj.reset(new handle_t()) + + def __getstate__(self): + return object() + + def __setstate__(self, state): + self.c_obj.reset(new handle_t()) + + def getHandle(self): + return self.c_obj.get() From f042cc028868e4e79b925cbfe6adb8609c59bac3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 15 Apr 2026 21:01:01 -0700 Subject: [PATCH 04/35] Remove pylibraft dependency --- ci/release/update-version.sh | 1 - .../all_cuda-129_arch-aarch64.yaml | 1 - .../all_cuda-129_arch-x86_64.yaml | 1 - .../all_cuda-131_arch-aarch64.yaml | 1 - .../all_cuda-131_arch-x86_64.yaml | 1 - conda/recipes/nvforest/recipe.yaml | 2 -- dependencies.yaml | 29 ------------------- python/agents.md | 2 +- python/nvforest/pyproject.toml | 2 -- 9 files changed, 1 insertion(+), 39 deletions(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 6e177e8..35e3775 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -112,7 +112,6 @@ DEPENDENCIES=( libraft libraft-headers librmm - pylibraft rapids-xgboost rmm ) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 1e40579..03854c5 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -29,7 +29,6 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index b9b9608..b9dab65 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -29,7 +29,6 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 1a7a0d8..062480b 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -29,7 +29,6 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 52489d8..4cefcc8 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -29,7 +29,6 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml index fe70082..90b24e4 100644 --- a/conda/recipes/nvforest/recipe.yaml +++ b/conda/recipes/nvforest/recipe.yaml @@ -80,7 +80,6 @@ requirements: - cython >=3.0.0 - libnvforest =${{ version }} - pip - - pylibraft =${{ minor_version }} - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* - rapids-build-backend >=0.4.0,<0.5.0.dev0 @@ -96,7 +95,6 @@ requirements: - libnvforest =${{ version }} - numpy >=1.23,<3.0a0 - scikit-learn >=1.4 - - pylibraft =${{ minor_version }} - python - treelite ${{ treelite_version }} - cuda-cudart diff --git a/dependencies.yaml b/dependencies.yaml index ae76ff9..f6b5f86 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -16,7 +16,6 @@ files: - depends_on_cupy - depends_on_libraft - depends_on_librmm - - depends_on_pylibraft - depends_on_rapids_logger - develop - docs @@ -37,7 +36,6 @@ files: - depends_on_cupy - depends_on_libraft - depends_on_librmm - - 
depends_on_pylibraft - depends_on_rapids_logger - py_build_nvforest - py_run_nvforest @@ -115,7 +113,6 @@ files: - depends_on_libnvforest - depends_on_libraft - depends_on_librmm - - depends_on_pylibraft - py_build_nvforest py_run_nvforest: output: pyproject @@ -126,7 +123,6 @@ files: - depends_on_cuda_python - depends_on_cupy - depends_on_libnvforest - - depends_on_pylibraft - py_run_nvforest py_test_nvforest: output: pyproject @@ -510,31 +506,6 @@ dependencies: packages: - librmm-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} - depends_on_pylibraft: - common: - - output_types: conda - packages: - - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0 - - output_types: requirements - packages: - # pip recognizes the index as a global option for the requirements.txt file - - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple - specific: - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - pylibraft-cu12==26.6.*,>=0.0.0a0 - - matrix: - cuda: "13.*" - cuda_suffixed: "true" - packages: - - pylibraft-cu13==26.6.*,>=0.0.0a0 - - matrix: - packages: - - *pylibraft_unsuffixed depends_on_rapids_logger: common: - output_types: [conda, requirements, pyproject] diff --git a/python/agents.md b/python/agents.md index 093ce8a..3e1bd9f 100644 --- a/python/agents.md +++ b/python/agents.md @@ -249,7 +249,7 @@ model = treelite.Model.load(filepath, format='xgboost_json') **Lightweight Design Philosophy**: nvForest must remain a lean, focused inference library. When reviewing changes that add dependencies: - **Question every new dependency**: Is it absolutely necessary? Can we achieve the same with existing deps? 
-- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm, pylibraft) +- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm) - **Avoid**: Large ML frameworks, libraries with heavy transitive dependencies, optional "nice-to-have" deps - **Install size matters**: New deps should not significantly increase wheel/conda package size - **Runtime dependencies are costly**: Each new import adds startup time and potential version conflicts diff --git a/python/nvforest/pyproject.toml b/python/nvforest/pyproject.toml index 5fae739..b88eac0 100644 --- a/python/nvforest/pyproject.toml +++ b/python/nvforest/pyproject.toml @@ -31,7 +31,6 @@ dependencies = [ "cupy-cuda13x>=13.6.0", "libnvforest==26.6.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", - "pylibraft==26.6.*,>=0.0.0a0", "scikit-learn>=1.5", "treelite>=4.6.1,<5.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -97,7 +96,6 @@ requires = [ "libraft==26.6.*,>=0.0.0a0", "librmm==26.6.*,>=0.0.0a0", "ninja", - "pylibraft==26.6.*,>=0.0.0a0", "treelite>=4.6.1,<5.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
From 97a897661e1b3d0fd78edd69c7fd7db2859515da Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 11:19:34 -0700 Subject: [PATCH 05/35] Add libraft-headers to Conda recipe --- conda/recipes/nvforest/recipe.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml index 90b24e4..a1c58cf 100644 --- a/conda/recipes/nvforest/recipe.yaml +++ b/conda/recipes/nvforest/recipe.yaml @@ -80,6 +80,7 @@ requirements: - cython >=3.0.0 - libnvforest =${{ version }} - pip + - libraft-headers =${{ minor_version }} - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* - rapids-build-backend >=0.4.0,<0.5.0.dev0 From ab44b9223ef510f6146448612cb68a00d0778fcd Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 14:22:56 -0700 Subject: [PATCH 06/35] Remove unused parameter --- python/nvforest/nvforest/detail/handle.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/nvforest/nvforest/detail/handle.pyx b/python/nvforest/nvforest/detail/handle.pyx index e4e9ae1..9f517ab 100644 --- a/python/nvforest/nvforest/detail/handle.pyx +++ b/python/nvforest/nvforest/detail/handle.pyx @@ -4,7 +4,7 @@ # cdef class Handle: - def __cinit__(self, c_obj=None): + def __cinit__(self): self.c_obj.reset(new handle_t()) def __getstate__(self): From b781fda030aef2e07039ebb5495bd4aa181dcc43 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 16:51:51 -0700 Subject: [PATCH 07/35] Revert "Add libraft-headers to Conda recipe" This reverts commit 97a897661e1b3d0fd78edd69c7fd7db2859515da. 
--- conda/recipes/nvforest/recipe.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml index a1c58cf..90b24e4 100644 --- a/conda/recipes/nvforest/recipe.yaml +++ b/conda/recipes/nvforest/recipe.yaml @@ -80,7 +80,6 @@ requirements: - cython >=3.0.0 - libnvforest =${{ version }} - pip - - libraft-headers =${{ minor_version }} - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* - rapids-build-backend >=0.4.0,<0.5.0.dev0 From 9000a73d1294512b301db931473b48f0842e6f3d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 17:09:45 -0700 Subject: [PATCH 08/35] Revert "Remove pylibraft dependency" This reverts commit f042cc028868e4e79b925cbfe6adb8609c59bac3. --- ci/release/update-version.sh | 1 + .../all_cuda-129_arch-aarch64.yaml | 1 + .../all_cuda-129_arch-x86_64.yaml | 1 + .../all_cuda-131_arch-aarch64.yaml | 1 + .../all_cuda-131_arch-x86_64.yaml | 1 + conda/recipes/nvforest/recipe.yaml | 2 ++ dependencies.yaml | 29 +++++++++++++++++++ python/agents.md | 2 +- python/nvforest/pyproject.toml | 2 ++ 9 files changed, 39 insertions(+), 1 deletion(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 35e3775..6e177e8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -112,6 +112,7 @@ DEPENDENCIES=( libraft libraft-headers librmm + pylibraft rapids-xgboost rmm ) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 03854c5..1e40579 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -29,6 +29,7 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index b9dab65..b9b9608 100644 --- 
a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -29,6 +29,7 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 062480b..1a7a0d8 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -29,6 +29,7 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 4cefcc8..52489d8 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -29,6 +29,7 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - pytest-xdist diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml index 90b24e4..fe70082 100644 --- a/conda/recipes/nvforest/recipe.yaml +++ b/conda/recipes/nvforest/recipe.yaml @@ -80,6 +80,7 @@ requirements: - cython >=3.0.0 - libnvforest =${{ version }} - pip + - pylibraft =${{ minor_version }} - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* - rapids-build-backend >=0.4.0,<0.5.0.dev0 @@ -95,6 +96,7 @@ requirements: - libnvforest =${{ version }} - numpy >=1.23,<3.0a0 - scikit-learn >=1.4 + - pylibraft =${{ minor_version }} - python - treelite ${{ treelite_version }} - cuda-cudart diff --git a/dependencies.yaml b/dependencies.yaml index f6b5f86..ae76ff9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -16,6 +16,7 @@ files: - depends_on_cupy - depends_on_libraft - depends_on_librmm + - depends_on_pylibraft - depends_on_rapids_logger - develop - docs @@ -36,6 +37,7 @@ files: - depends_on_cupy - 
depends_on_libraft - depends_on_librmm + - depends_on_pylibraft - depends_on_rapids_logger - py_build_nvforest - py_run_nvforest @@ -113,6 +115,7 @@ files: - depends_on_libnvforest - depends_on_libraft - depends_on_librmm + - depends_on_pylibraft - py_build_nvforest py_run_nvforest: output: pyproject @@ -123,6 +126,7 @@ files: - depends_on_cuda_python - depends_on_cupy - depends_on_libnvforest + - depends_on_pylibraft - py_run_nvforest py_test_nvforest: output: pyproject @@ -506,6 +510,31 @@ dependencies: packages: - librmm-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} + depends_on_pylibraft: + common: + - output_types: conda + packages: + - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - pylibraft-cu12==26.6.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - pylibraft-cu13==26.6.*,>=0.0.0a0 + - matrix: + packages: + - *pylibraft_unsuffixed depends_on_rapids_logger: common: - output_types: [conda, requirements, pyproject] diff --git a/python/agents.md b/python/agents.md index 3e1bd9f..093ce8a 100644 --- a/python/agents.md +++ b/python/agents.md @@ -249,7 +249,7 @@ model = treelite.Model.load(filepath, format='xgboost_json') **Lightweight Design Philosophy**: nvForest must remain a lean, focused inference library. When reviewing changes that add dependencies: - **Question every new dependency**: Is it absolutely necessary? Can we achieve the same with existing deps? 
-- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm) +- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm, pylibraft) - **Avoid**: Large ML frameworks, libraries with heavy transitive dependencies, optional "nice-to-have" deps - **Install size matters**: New deps should not significantly increase wheel/conda package size - **Runtime dependencies are costly**: Each new import adds startup time and potential version conflicts diff --git a/python/nvforest/pyproject.toml b/python/nvforest/pyproject.toml index b88eac0..5fae739 100644 --- a/python/nvforest/pyproject.toml +++ b/python/nvforest/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "cupy-cuda13x>=13.6.0", "libnvforest==26.6.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", + "pylibraft==26.6.*,>=0.0.0a0", "scikit-learn>=1.5", "treelite>=4.6.1,<5.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -96,6 +97,7 @@ requires = [ "libraft==26.6.*,>=0.0.0a0", "librmm==26.6.*,>=0.0.0a0", "ninja", + "pylibraft==26.6.*,>=0.0.0a0", "treelite>=4.6.1,<5.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
From cd9bece457f113b0865eb11ec82c6c90588328ab Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 17:10:31 -0700 Subject: [PATCH 09/35] Remove custom handle types entirely --- .../nvforest/detail/raft_proto/handle.hpp | 43 ------------- cpp/include/nvforest/forest_model.hpp | 24 ++++---- cpp/include/nvforest/handle.hpp | 60 ------------------- cpp/tests/treelite_importer.cpp | 11 ++-- python/nvforest/CMakeLists.txt | 6 -- python/nvforest/nvforest/_handle.py | 5 +- .../nvforest/detail/forest_inference.pyx | 21 +++++-- python/nvforest/nvforest/detail/handle.pxd | 20 ------- python/nvforest/nvforest/detail/handle.pyx | 17 ------ 9 files changed, 37 insertions(+), 170 deletions(-) delete mode 100644 cpp/include/nvforest/detail/raft_proto/handle.hpp delete mode 100644 cpp/include/nvforest/handle.hpp delete mode 100644 python/nvforest/nvforest/detail/handle.pxd delete mode 100644 python/nvforest/nvforest/detail/handle.pyx diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/detail/raft_proto/handle.hpp deleted file mode 100644 index 086d61b..0000000 --- a/cpp/include/nvforest/detail/raft_proto/handle.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include - -#include -#include -#ifdef NVFOREST_ENABLE_GPU -#include -#endif - -namespace raft_proto { -#ifdef NVFOREST_ENABLE_GPU -struct handle_t { - handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {} - handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {} - auto get_next_usable_stream() const - { - return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()}; - } - auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); } - auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } - void synchronize() const - { - raft_handle_->sync_stream_pool(); - raft_handle_->sync_stream(); - } - - private: - // Have to store a pointer because handle is not movable - raft::handle_t const* raft_handle_; -}; -#else -struct handle_t { - auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; } - auto get_stream_pool_size() const { return std::size_t{}; } - auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } - void synchronize() const {} -}; -#endif -} // namespace raft_proto diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index eecda34..2cd36df 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -7,9 +7,10 @@ #include #include #include -#include #include +#include + #include #include #include @@ -152,7 +153,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] handle The nvforest::handle_t which will be used to provide + * @param[in] resource RAFT resource which will be used to provide * streams for evaluation. * @param[out] output The buffer where model output should be stored. 
If * this buffer is on host while the model is on device or vice versa, @@ -177,31 +178,32 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(handle_t const& handle, + void predict(raft::device_resources const& resource, raft_proto::buffer& output, raft_proto::buffer const& input, infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { std::visit( - [this, predict_type, &handle, &output, &input, &specified_chunk_size]( + [this, predict_type, &resource, &output, &input, &specified_chunk_size]( auto&& concrete_forest) { using model_io_t = typename std::remove_reference_t::io_type; if constexpr (std::is_same_v) { if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { concrete_forest.predict( - output, input, handle.get_next_usable_stream(), predict_type, specified_chunk_size); + output, input, resource.get_next_usable_stream(), predict_type, specified_chunk_size); } else { auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64}; auto constexpr static const MAX_CHUNK_SIZE = std::size_t{64}; - auto row_count = input.size() / num_features(); + auto row_count = input.size() / num_features(); + auto usable_stream_count = std::max(resource.get_stream_pool_size(), std::size_t{1}); auto partition_size = - std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()), + std::max(raft_proto::ceildiv(row_count, usable_stream_count), specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); auto partition_count = raft_proto::ceildiv(row_count, partition_size); for (auto i = std::size_t{}; i < partition_count; ++i) { - auto stream = handle.get_next_usable_stream(); + auto stream = resource.get_next_usable_stream(); auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); auto partition_in = raft_proto::buffer{}; @@ -252,7 +254,7 @@ struct forest_model { /** 
* Perform inference on given input * - * @param[in] handle The nvforest::handle_t which will be used to provide + * @param[in] resource RAFT resource which will be used to provide * streams for evaluation. * @param[out] output Pointer to the memory location where output should end * up @@ -276,7 +278,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(handle_t const& handle, + void predict(raft::device_resources const& resource, io_t* output, io_t* input, std::size_t num_rows, @@ -296,7 +298,7 @@ struct forest_model { raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id}; auto in_buffer = raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id}; - predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size); + predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size); } private: diff --git a/cpp/include/nvforest/handle.hpp b/cpp/include/nvforest/handle.hpp deleted file mode 100644 index 5ea4f3b..0000000 --- a/cpp/include/nvforest/handle.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include - -#ifdef NVFOREST_ENABLE_GPU -#include - -#include -#endif - -namespace nvforest { - -#ifdef NVFOREST_ENABLE_GPU -/** - * A thin wrapper around raft_proto::handle_t that owns the underlying raft::handle_t. - * - * Default construction automatically creates both a raft::handle_t and the - * raft_proto::handle_t that references it, so callers do not need to manage - * RAFT handles directly. 
- */ -struct handle_t { - /** Default constructor: creates and owns a raft::handle_t and wraps it */ - handle_t() - : owned_raft_handle_{std::make_unique()}, - raft_proto_handle_{*owned_raft_handle_} - { - } - - /** Wrap an externally-owned raft::handle_t without taking ownership */ - handle_t(raft::handle_t const& raft_handle) : raft_proto_handle_{raft_handle} {} - - auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); } - auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); } - auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); } - void synchronize() const { raft_proto_handle_.synchronize(); } - - private: - // Null when wrapping an external raft::handle_t - std::unique_ptr owned_raft_handle_; - raft_proto::handle_t raft_proto_handle_; -}; -#else -/** - * CPU-only handle: thin wrapper around the no-op raft_proto::handle_t. - */ -struct handle_t { - auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); } - auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); } - auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); } - void synchronize() const { raft_proto_handle_.synchronize(); } - - private: - raft_proto::handle_t raft_proto_handle_; -}; -#endif - -} // namespace nvforest diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index fcd8ad2..f2b3083 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -4,11 +4,12 @@ */ #include -#include #include #include #include +#include + #include #include #include @@ -338,11 +339,11 @@ TEST(TreeliteImporter, DegenerateTree) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_FALSE(fil_model.has_vector_leaves()); - auto handle = nvforest::handle_t{}; + auto resource = raft::device_resources{}; auto X = 
std::vector{0.0}; auto preds = std::vector(1, 0.0); auto expected_preds = std::vector{1.0}; - fil_model.predict(handle, + fil_model.predict(resource, preds.data(), X.data(), 1, @@ -359,11 +360,11 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_TRUE(fil_model.has_vector_leaves()); - auto handle = nvforest::handle_t{}; + auto resource = raft::device_resources{}; auto X = std::vector{0.0}; auto preds = std::vector(2, 0.0); auto expected_preds = std::vector{0.5, 0.5}; - fil_model.predict(handle, + fil_model.predict(resource, preds.data(), X.data(), 1, diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt index dfe234d..fd548e5 100644 --- a/python/nvforest/CMakeLists.txt +++ b/python/nvforest/CMakeLists.txt @@ -72,12 +72,6 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) -rapids_cython_create_modules( - CXX - SOURCE_FILES "nvforest/detail/handle.pyx" - LINKED_LIBRARIES "${linked_libraries}" - INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) - rapids_cython_create_modules( CXX SOURCE_FILES "nvforest/detail/treelite.pyx" diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py index 8b68338..0633eee 100644 --- a/python/nvforest/nvforest/_handle.py +++ b/python/nvforest/nvforest/_handle.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -from nvforest.detail.handle import Handle as _Handle +from pylibraft.common.handle import DeviceResources as RaftDeviceResources -Handle = _Handle +# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources +Handle = RaftDeviceResources diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index dec4e22..946c3fa 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ 
b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -15,8 +15,8 @@ from nvforest.detail.treelite import safe_treelite_call from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool +from rmm.librmm.cuda_stream_view cimport cuda_stream_view -from nvforest.detail.handle cimport handle_t from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op from nvforest.detail.raft_proto.cuda_stream cimport ( @@ -34,10 +34,18 @@ from nvforest.detail.treelite cimport ( ) +cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil: + cdef cppclass device_resources: + device_resources() except + + cuda_stream_view get_next_usable_stream() except + + void sync_stream() except + + void sync_stream_pool() except + + + cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( - const handle_t&, + const device_resources&, io_t*, io_t*, size_t, @@ -70,7 +78,7 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: cdef class ForestInference_impl(): cdef forest_model model cdef object py_handle - cdef handle_t* c_handle + cdef device_resources* c_handle cdef object device def __cinit__( @@ -85,7 +93,7 @@ cdef class ForestInference_impl(): device_id: Optional[int] = None, ): self.py_handle = handle - self.c_handle = self.py_handle.getHandle() + self.c_handle = self.py_handle.getHandle() cdef optional[bool] use_double_precision_c cdef bool use_double_precision_bool @@ -130,7 +138,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.c_handle.get_next_usable_stream() + self.c_handle.get_next_usable_stream().value() ) safe_treelite_call( @@ -262,7 +270,8 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.c_handle.synchronize() + self.c_handle.sync_stream_pool() + self.c_handle.sync_stream() return preds diff --git 
a/python/nvforest/nvforest/detail/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd deleted file mode 100644 index 033caf3..0000000 --- a/python/nvforest/nvforest/detail/handle.pxd +++ /dev/null @@ -1,20 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# - -from libcpp.memory cimport unique_ptr - -from nvforest.detail.raft_proto.cuda_stream cimport ( - cuda_stream as raft_proto_stream_t, -) - - -cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil: - cdef cppclass handle_t: - handle_t() except + - raft_proto_stream_t get_next_usable_stream() except + - void synchronize() except+ - -cdef class Handle: - cdef unique_ptr[handle_t] c_obj diff --git a/python/nvforest/nvforest/detail/handle.pyx b/python/nvforest/nvforest/detail/handle.pyx deleted file mode 100644 index 9f517ab..0000000 --- a/python/nvforest/nvforest/detail/handle.pyx +++ /dev/null @@ -1,17 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -# - -cdef class Handle: - def __cinit__(self): - self.c_obj.reset(new handle_t()) - - def __getstate__(self): - return object() - - def __setstate__(self, state): - self.c_obj.reset(new handle_t()) - - def getHandle(self): - return self.c_obj.get() From fb4355affc443b0111e154d00676fb322a1f06e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 16 Apr 2026 17:43:32 -0700 Subject: [PATCH 10/35] Add new interface to auto-instantiate raft resource --- cpp/include/nvforest/forest_model.hpp | 44 +++++++++++++++++++++++++++ cpp/tests/treelite_importer.cpp | 4 +-- docs/source/getting_started.rst | 6 +--- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 2cd36df..be45584 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -301,6 +301,50 @@ struct forest_model { predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size); } + /** + * Perform inference on given input (with auto-instantiated RAFT resource) + * + * @param[out] output Pointer to the memory location where output should end + * up + * @param[in] input Pointer to the input data + * @param[in] num_rows Number of rows in input + * @param[in] out_mem_type The memory type (device/host) of the output + * buffer + * @param[in] in_mem_type The memory type (device/host) of the input buffer + * @param[in] predict_type Type of inference to perform. Defaults to summing + * the outputs of all trees and produce an output per row. If set to + * "per_tree", we will instead output all outputs of individual trees. + * If set to "leaf_id", we will output the integer ID of the leaf node + * for each tree. + * @param[in] specified_chunk_size: Specifies the mini-batch size for + * processing. 
This has different meanings on CPU and GPU, but on GPU it + * corresponds to the number of rows evaluated per inference iteration + * on a single block. It can take on any power of 2 from 1 to 32, and + * runtime performance is quite sensitive to the value chosen. In general, + * larger batches benefit from higher values, but it is hard to predict the + * optimal value a priori. If omitted, a heuristic will be used to select a + * reasonable value. On CPU, this argument can generally just be omitted. + */ + template + void predict(io_t* output, + io_t* input, + std::size_t num_rows, + raft_proto::device_type out_mem_type, + raft_proto::device_type in_mem_type, + infer_kind predict_type = infer_kind::default_kind, + std::optional specified_chunk_size = std::nullopt) + { + auto resource = raft::device_resources{}; + predict(resource, + output, + input, + num_rows, + out_mem_type, + in_mem_type, + predict_type, + specified_chunk_size); + } + private: decision_forest_variant decision_forest_; }; diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index f2b3083..f6f3413 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -360,12 +360,10 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_TRUE(fil_model.has_vector_leaves()); - auto resource = raft::device_resources{}; auto X = std::vector{0.0}; auto preds = std::vector(2, 0.0); auto expected_preds = std::vector{0.5, 0.5}; - fil_model.predict(resource, - preds.data(), + fil_model.predict(preds.data(), X.data(), 1, raft_proto::device_type::cpu, diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 2522f7d..e94b666 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -195,14 +195,10 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. 
code-block:: cpp - #include - - auto handle = nvforest::handle_t{}; - // Assumption: // * Both output and input are in the GPU memory. // * The input buffer should be of dimension (num_rows, num_features) // * The output buffer should be of dimension (num_rows, fm.num_outputs()) - fm.predict(handle, output, input, num_rows, + fm.predict(output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); From f4c6fee4c6ebd1bd7d4bcb289b41a963a760cc2e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 17 Apr 2026 21:16:13 -0700 Subject: [PATCH 11/35] Add a note about re-using the RAFT handle --- docs/source/getting_started.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index e94b666..662e09a 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -202,3 +202,20 @@ Now that the tree model is fully imported into nvForest, let's run inference: fm.predict(output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); + +.. note:: Reuse the resource handle to reduce overhead + + nvForest internally creates a resource handle (``raft::device_resources``) + to manage GPU resources. Creation of the resource handle adds a slight + performance overhead. If you plan to call :cpp:func:`nvforest::predict` + multiple times, consider creating the resource handle explicitly + and re-using the handle between the function calls. + + .. 
code-block:: cpp + + #include + + auto resource = raft::device_resources{}; + fm.predict(resource, output, input, num_rows, + raft_proto::device_type::gpu, raft_proto::device_type::gpu, + nvforest::infer_kind::default_kind); From 59cfadc205c6ef55fcfcbd6454849e7290b710c9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 17 Apr 2026 21:19:21 -0700 Subject: [PATCH 12/35] Add a comment --- docs/source/getting_started.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 662e09a..f5d10db 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -216,6 +216,8 @@ Now that the tree model is fully imported into nvForest, let's run inference: #include auto resource = raft::device_resources{}; + + // Calling predict multiple times ... fm.predict(resource, output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); From 8efe44e5edddca432dd109da19854c9b293a5a40 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 18 Apr 2026 14:03:52 -0700 Subject: [PATCH 13/35] Improved formatting --- docs/source/getting_started.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index f5d10db..3257387 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -205,11 +205,13 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. note:: Reuse the resource handle to reduce overhead - nvForest internally creates a resource handle (``raft::device_resources``) + nvForest internally creates a resource handle + (:cpp:class:`raft::device_resources`) to manage GPU resources. Creation of the resource handle adds a slight - performance overhead. 
If you plan to call :cpp:func:`nvforest::predict` - multiple times, consider creating the resource handle explicitly - and re-using the handle between the function calls. + performance overhead. If you plan to call + :cpp:func:`~nvforest::forest_model::predict` multiple times, consider + creating the resource handle explicitly and re-using the handle between + the function calls. .. code-block:: cpp From 564e38c4f9900838a6cb8ea1f99c813e484075ec Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Apr 2026 18:50:05 -0700 Subject: [PATCH 14/35] Create device_resource wrapper in C++ --- cpp/include/nvforest/device_resources.hpp | 22 ++++++++++++++++++++++ cpp/include/nvforest/forest_model.hpp | 15 +++++++-------- cpp/tests/treelite_importer.cpp | 5 ++--- 3 files changed, 31 insertions(+), 11 deletions(-) create mode 100644 cpp/include/nvforest/device_resources.hpp diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp new file mode 100644 index 0000000..04621d3 --- /dev/null +++ b/cpp/include/nvforest/device_resources.hpp @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +namespace nvforest { + +class device_resources { + public: + device_resources() : res_{} {} + + auto get_next_usable_stream() const { return res_.get_next_usable_stream(); } + auto get_stream_pool_size() const { return res_.get_stream_pool_size(); } + + private: + raft::device_resources res_; +}; + +} // namespace nvforest diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index be45584..84987a8 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -7,10 +7,9 @@ #include #include #include +#include #include -#include - #include #include #include @@ -153,7 +152,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource RAFT resource which will be used to provide + * @param[in] resource device resource which will be used to provide * streams for evaluation. * @param[out] output The buffer where model output should be stored. If * this buffer is on host while the model is on device or vice versa, @@ -178,7 +177,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft::device_resources const& resource, + void predict(nvforest::device_resources const& resource, raft_proto::buffer& output, raft_proto::buffer const& input, infer_kind predict_type = infer_kind::default_kind, @@ -254,7 +253,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource RAFT resource which will be used to provide + * @param[in] resource device resource which will be used to provide * streams for evaluation. * @param[out] output Pointer to the memory location where output should end * up @@ -278,7 +277,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. 
*/ template - void predict(raft::device_resources const& resource, + void predict(nvforest::device_resources const& resource, io_t* output, io_t* input, std::size_t num_rows, @@ -302,7 +301,7 @@ struct forest_model { } /** - * Perform inference on given input (with auto-instantiated RAFT resource) + * Perform inference on given input (with auto-instantiated device resource) * * @param[out] output Pointer to the memory location where output should end * up @@ -334,7 +333,7 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - auto resource = raft::device_resources{}; + auto resource = nvforest::device_resources{}; predict(resource, output, input, diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index f6f3413..d79c423 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -4,12 +4,11 @@ */ #include +#include #include #include #include -#include - #include #include #include @@ -339,7 +338,7 @@ TEST(TreeliteImporter, DegenerateTree) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_FALSE(fil_model.has_vector_leaves()); - auto resource = raft::device_resources{}; + auto resource = nvforest::device_resources{}; auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); auto expected_preds = std::vector{1.0}; From 13ddf90417114f9426f15147b162bd86520e9b2d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Apr 2026 19:02:44 -0700 Subject: [PATCH 15/35] Define DeviceResources in Python pkg --- cpp/include/nvforest/device_resources.hpp | 5 +++ python/nvforest/CMakeLists.txt | 6 +++ python/nvforest/nvforest/__init__.py | 4 +- python/nvforest/nvforest/_device_resources.py | 10 +++++ python/nvforest/nvforest/_factory.py | 36 ++++++++--------- python/nvforest/nvforest/_forest_inference.py | 40 +++++++++---------- python/nvforest/nvforest/_handle.py | 9 ----- 
.../nvforest/detail/device_resources.pxd | 20 ++++++++++ .../nvforest/detail/device_resources.pyx | 17 ++++++++ .../nvforest/detail/forest_inference.pyx | 37 +++++++---------- 10 files changed, 112 insertions(+), 72 deletions(-) create mode 100644 python/nvforest/nvforest/_device_resources.py delete mode 100644 python/nvforest/nvforest/_handle.py create mode 100644 python/nvforest/nvforest/detail/device_resources.pxd create mode 100644 python/nvforest/nvforest/detail/device_resources.pyx diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp index 04621d3..ebe4543 100644 --- a/cpp/include/nvforest/device_resources.hpp +++ b/cpp/include/nvforest/device_resources.hpp @@ -14,6 +14,11 @@ class device_resources { auto get_next_usable_stream() const { return res_.get_next_usable_stream(); } auto get_stream_pool_size() const { return res_.get_stream_pool_size(); } + void synchronize() const + { + res_.sync_stream_pool(); + res_.sync_stream(); + } private: raft::device_resources res_; diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt index fd548e5..1b37788 100644 --- a/python/nvforest/CMakeLists.txt +++ b/python/nvforest/CMakeLists.txt @@ -77,3 +77,9 @@ rapids_cython_create_modules( SOURCE_FILES "nvforest/detail/treelite.pyx" LINKED_LIBRARIES "${linked_libraries}" INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "nvforest/detail/device_resources.pyx" + LINKED_LIBRARIES "${linked_libraries}" + INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py index 4fc9b15..27a222a 100644 --- a/python/nvforest/nvforest/__init__.py +++ b/python/nvforest/nvforest/__init__.py @@ -13,6 +13,7 @@ libnvforest.load_library() del libnvforest +from nvforest._device_resources import DeviceResources from nvforest._factory import ( load_from_sklearn, load_from_treelite_model, @@ -24,7 
+25,6 @@ GPUForestInferenceClassifier, GPUForestInferenceRegressor, ) -from nvforest._handle import Handle from nvforest._version import __git_commit__, __version__ __all__ = [ @@ -32,7 +32,7 @@ "CPUForestInferenceRegressor", "GPUForestInferenceClassifier", "GPUForestInferenceRegressor", - "Handle", + "DeviceResources", "load_model", "load_from_sklearn", "load_from_treelite_model", diff --git a/python/nvforest/nvforest/_device_resources.py b/python/nvforest/nvforest/_device_resources.py new file mode 100644 index 0000000..85991ba --- /dev/null +++ b/python/nvforest/nvforest/_device_resources.py @@ -0,0 +1,10 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +from nvforest.detail.device_resources import ( + DeviceResources as _DeviceResources, +) + +DeviceResources = _DeviceResources diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py index cc0700a..e640470 100644 --- a/python/nvforest/nvforest/_factory.py +++ b/python/nvforest/nvforest/_factory.py @@ -9,6 +9,7 @@ import treelite from nvforest._base import ForestInference +from nvforest._device_resources import DeviceResources from nvforest._forest_inference import ( CPUForestInferenceClassifier, CPUForestInferenceRegressor, @@ -17,7 +18,6 @@ infer_device, infer_is_classifier, ) -from nvforest._handle import Handle def get_forest_inference_class(device, is_classifier) -> type: @@ -39,7 +39,7 @@ def make_forest_inference_object( treelite_model: treelite.Model, device: str, device_id: Optional[int], - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -50,7 +50,7 @@ def make_forest_inference_object( kwargs = dict( treelite_model=treelite_model, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -72,7 +72,7 @@ def load_model( align_bytes: Optional[int] 
= None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, ) -> ForestInference: """Load a model into nvForest from a serialized model file. @@ -113,10 +113,10 @@ def load_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream + resource : nvforest.DeviceResources or None + For GPU execution, the device resource containing the stream or stream pool to use during loading and inference. If not given, a new - handle will be constructed. + resource will be constructed. """ model_path = pathlib.Path(model_file) if not model_path.exists(): @@ -157,7 +157,7 @@ def load_model( treelite_model=tl_model, device=device, device_id=device_id, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -174,7 +174,7 @@ def load_from_sklearn( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, ) -> ForestInference: """Load a Scikit-Learn forest model to nvForest @@ -208,10 +208,10 @@ def load_from_sklearn( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream + resource : nvforest.DeviceResources or None + For GPU execution, the device resource containing the stream or stream pool to use during loading and inference. If not given, a new - handle will be constructed. + resource will be constructed. 
""" tl_model = treelite.sklearn.import_model(skl_model) @@ -219,7 +219,7 @@ def load_from_sklearn( treelite_model=tl_model, device=device, device_id=device_id, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -236,7 +236,7 @@ def load_from_treelite_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, ) -> ForestInference: """Load a Treelite forest model to nvForest @@ -270,16 +270,16 @@ def load_from_treelite_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream + resource : nvforest.DeviceResources or None + For GPU execution, the device resource containing the stream or stream pool to use during loading and inference. If not given, a new - handle will be constructed. + resource will be constructed. 
""" return make_forest_inference_object( treelite_model=tl_model, device=device, device_id=device_id, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py index 2c3c933..b3de975 100644 --- a/python/nvforest/nvforest/_forest_inference.py +++ b/python/nvforest/nvforest/_forest_inference.py @@ -18,7 +18,7 @@ from cuda.bindings import runtime from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor -from nvforest._handle import Handle +from nvforest._device_resources import DeviceResources from nvforest._typing import DataType from nvforest.detail.forest_inference import ForestInferenceImpl @@ -123,7 +123,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -250,7 +250,7 @@ def optimize( else: test_instances[layout] = type(self)._create_with_layout( treelite_model_bytes=self.forest.treelite_model_bytes, - handle=self.forest.handle, + resource=self.forest.resource, layout=layout, default_chunk_size=None, align_bytes=self.forest.align_bytes, @@ -293,7 +293,7 @@ def optimize( # Return a new instance with optimal settings return type(self)._create_with_layout( treelite_model_bytes=self.forest.treelite_model_bytes, - handle=self.forest.handle, + resource=self.forest.resource, layout=optimal_layout, default_chunk_size=optimal_chunk_size, align_bytes=self.forest.align_bytes, @@ -310,7 +310,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -322,7 +322,7 @@ def __init__( treelite_model=treelite_model, device="cpu", device_id=-1, - 
handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -334,7 +334,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -346,7 +346,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -425,7 +425,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -437,7 +437,7 @@ def __init__( treelite_model=treelite_model, device="cpu", device_id=-1, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -449,7 +449,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -461,7 +461,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -532,7 +532,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -545,7 +545,7 @@ def __init__( treelite_model=treelite_model, device="gpu", device_id=device_id, - handle=handle, + resource=resource, layout=layout, 
default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -557,7 +557,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -569,7 +569,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -649,7 +649,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -662,7 +662,7 @@ def __init__( treelite_model=treelite_model, device="gpu", device_id=device_id, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -674,7 +674,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + resource: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -686,7 +686,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - handle=handle, + resource=resource, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py deleted file mode 100644 index 0633eee..0000000 --- a/python/nvforest/nvforest/_handle.py +++ /dev/null @@ -1,9 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -# - -from pylibraft.common.handle import DeviceResources as RaftDeviceResources - -# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources -Handle = RaftDeviceResources diff --git a/python/nvforest/nvforest/detail/device_resources.pxd b/python/nvforest/nvforest/detail/device_resources.pxd new file mode 100644 index 0000000..8aa4aef --- /dev/null +++ b/python/nvforest/nvforest/detail/device_resources.pxd @@ -0,0 +1,20 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +from libcpp.memory cimport unique_ptr + +from nvforest.detail.raft_proto.cuda_stream cimport ( + cuda_stream as raft_proto_stream_t, +) + + +cdef extern from "nvforest/device_resources.hpp" namespace "nvforest" nogil: + cdef cppclass device_resources: + device_resources() except + + raft_proto_stream_t get_next_usable_stream() except + + void synchronize() except + + +cdef class DeviceResources: + cdef unique_ptr[device_resources] c_obj diff --git a/python/nvforest/nvforest/detail/device_resources.pyx b/python/nvforest/nvforest/detail/device_resources.pyx new file mode 100644 index 0000000..98ca786 --- /dev/null +++ b/python/nvforest/nvforest/detail/device_resources.pyx @@ -0,0 +1,17 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# + +cdef class DeviceResources: + def __cinit__(self, c_obj=None): + self.c_obj.reset(new device_resources()) + + def __getstate__(self): + return object() + + def __setstate__(self, state): + self.c_obj.reset(new device_resources()) + + def get_c_obj(self): + return self.c_obj.get() diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index 946c3fa..557d3ee 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -8,15 +8,15 @@ from typing import Optional, Union import numpy as np import treelite -from nvforest._handle import Handle +from nvforest._device_resources import DeviceResources from nvforest._typing import DataType from nvforest.detail.treelite import safe_treelite_call from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from nvforest.detail.device_resources cimport device_resources from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op from nvforest.detail.raft_proto.cuda_stream cimport ( @@ -34,14 +34,6 @@ from nvforest.detail.treelite cimport ( ) -cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil: - cdef cppclass device_resources: - device_resources() except + - cuda_stream_view get_next_usable_stream() except + - void sync_stream() except + - void sync_stream_pool() except + - - cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( @@ -77,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: cdef class ForestInference_impl(): cdef forest_model model - cdef object py_handle - cdef device_resources* c_handle + cdef object py_resource + cdef device_resources* 
c_resource cdef object device def __cinit__( self, - handle: object, + resource: object, tl_model_bytes: Union[bytes, bytearray], *, layout: str = "depth_first", @@ -92,8 +84,8 @@ cdef class ForestInference_impl(): device: str = "cpu", device_id: Optional[int] = None, ): - self.py_handle = handle - self.c_handle = self.py_handle.getHandle() + self.py_resource = resource + self.c_resource = self.py_resource.get_c_obj() cdef optional[bool] use_double_precision_c cdef bool use_double_precision_bool @@ -138,7 +130,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.c_handle.get_next_usable_stream().value() + self.c_resource.get_next_usable_stream() ) safe_treelite_call( @@ -248,7 +240,7 @@ cdef class ForestInference_impl(): if model_dtype == np.float32: self.model.predict[float]( - deref(self.c_handle), + deref(self.c_resource), out_ptr, in_ptr, n_rows, @@ -259,7 +251,7 @@ cdef class ForestInference_impl(): ) else: self.model.predict[double]( - deref(self.c_handle), + deref(self.c_resource), out_ptr, in_ptr, n_rows, @@ -270,8 +262,7 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.c_handle.sync_stream_pool() - self.c_handle.sync_stream() + self.c_resource.synchronize() return preds @@ -282,7 +273,7 @@ class ForestInferenceImpl: treelite_model: treelite.Model, device: str, device_id: int, - handle: Optional[Handle] = None, + resource: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -290,7 +281,7 @@ class ForestInferenceImpl: ): # Assumption: The caller needs to pass in correct (device, device_id) pair # This function will not contain any logic for auto-detecting device. 
- self.handle = Handle() if handle is None else handle + self.resource = DeviceResources() if resource is None else resource self._layout = layout self.precision = precision self.default_chunk_size = default_chunk_size @@ -318,7 +309,7 @@ class ForestInferenceImpl: self._treelite_model_bytes = treelite_model.serialize_bytes() self.impl = ForestInference_impl( - self.handle, + self.resource, self._treelite_model_bytes, layout=self._layout, align_bytes=self.align_bytes, From ed7627e2f3026ee997d0ddf7248a4fa2438cee62 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Apr 2026 19:18:57 -0700 Subject: [PATCH 16/35] Update doc --- docs/source/getting_started.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 3257387..11d70d3 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -206,7 +206,7 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. note:: Reuse the resource handle to reduce overhead nvForest internally creates a resource handle - (:cpp:class:`raft::device_resources`) + (:cpp:class:`nvforest::device_resources`) to manage GPU resources. Creation of the resource handle adds a slight performance overhead. If you plan to call :cpp:func:`~nvforest::forest_model::predict` multiple times, consider @@ -215,9 +215,9 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. code-block:: cpp - #include + #include - auto resource = raft::device_resources{}; + auto resource = nvforest::device_resources{}; // Calling predict multiple times ... fm.predict(resource, output, input, num_rows, From 65b83cea10de1911ac9761bca39dc0c25c940969 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 2 May 2026 01:20:04 -0700 Subject: [PATCH 17/35] Revert "Define DeviceResources in Python pkg" This reverts commit 13ddf90417114f9426f15147b162bd86520e9b2d. 
--- cpp/include/nvforest/device_resources.hpp | 5 --- python/nvforest/CMakeLists.txt | 6 --- python/nvforest/nvforest/__init__.py | 4 +- python/nvforest/nvforest/_device_resources.py | 10 ----- python/nvforest/nvforest/_factory.py | 36 ++++++++--------- python/nvforest/nvforest/_forest_inference.py | 40 +++++++++---------- python/nvforest/nvforest/_handle.py | 9 +++++ .../nvforest/detail/device_resources.pxd | 20 ---------- .../nvforest/detail/device_resources.pyx | 17 -------- .../nvforest/detail/forest_inference.pyx | 37 ++++++++++------- 10 files changed, 72 insertions(+), 112 deletions(-) delete mode 100644 python/nvforest/nvforest/_device_resources.py create mode 100644 python/nvforest/nvforest/_handle.py delete mode 100644 python/nvforest/nvforest/detail/device_resources.pxd delete mode 100644 python/nvforest/nvforest/detail/device_resources.pyx diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp index ebe4543..04621d3 100644 --- a/cpp/include/nvforest/device_resources.hpp +++ b/cpp/include/nvforest/device_resources.hpp @@ -14,11 +14,6 @@ class device_resources { auto get_next_usable_stream() const { return res_.get_next_usable_stream(); } auto get_stream_pool_size() const { return res_.get_stream_pool_size(); } - void synchronize() const - { - res_.sync_stream_pool(); - res_.sync_stream(); - } private: raft::device_resources res_; diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt index 1b37788..fd548e5 100644 --- a/python/nvforest/CMakeLists.txt +++ b/python/nvforest/CMakeLists.txt @@ -77,9 +77,3 @@ rapids_cython_create_modules( SOURCE_FILES "nvforest/detail/treelite.pyx" LINKED_LIBRARIES "${linked_libraries}" INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) - -rapids_cython_create_modules( - CXX - SOURCE_FILES "nvforest/detail/device_resources.pyx" - LINKED_LIBRARIES "${linked_libraries}" - INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_) diff --git 
a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py index 27a222a..4fc9b15 100644 --- a/python/nvforest/nvforest/__init__.py +++ b/python/nvforest/nvforest/__init__.py @@ -13,7 +13,6 @@ libnvforest.load_library() del libnvforest -from nvforest._device_resources import DeviceResources from nvforest._factory import ( load_from_sklearn, load_from_treelite_model, @@ -25,6 +24,7 @@ GPUForestInferenceClassifier, GPUForestInferenceRegressor, ) +from nvforest._handle import Handle from nvforest._version import __git_commit__, __version__ __all__ = [ @@ -32,7 +32,7 @@ "CPUForestInferenceRegressor", "GPUForestInferenceClassifier", "GPUForestInferenceRegressor", - "DeviceResources", + "Handle", "load_model", "load_from_sklearn", "load_from_treelite_model", diff --git a/python/nvforest/nvforest/_device_resources.py b/python/nvforest/nvforest/_device_resources.py deleted file mode 100644 index 85991ba..0000000 --- a/python/nvforest/nvforest/_device_resources.py +++ /dev/null @@ -1,10 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -# - -from nvforest.detail.device_resources import ( - DeviceResources as _DeviceResources, -) - -DeviceResources = _DeviceResources diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py index e640470..cc0700a 100644 --- a/python/nvforest/nvforest/_factory.py +++ b/python/nvforest/nvforest/_factory.py @@ -9,7 +9,6 @@ import treelite from nvforest._base import ForestInference -from nvforest._device_resources import DeviceResources from nvforest._forest_inference import ( CPUForestInferenceClassifier, CPUForestInferenceRegressor, @@ -18,6 +17,7 @@ infer_device, infer_is_classifier, ) +from nvforest._handle import Handle def get_forest_inference_class(device, is_classifier) -> type: @@ -39,7 +39,7 @@ def make_forest_inference_object( treelite_model: treelite.Model, device: str, device_id: Optional[int], - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -50,7 +50,7 @@ def make_forest_inference_object( kwargs = dict( treelite_model=treelite_model, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -72,7 +72,7 @@ def load_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a model into nvForest from a serialized model file. @@ -113,10 +113,10 @@ def load_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. 
- resource : nvforest.DeviceResources or None - For GPU execution, the device resource containing the stream or stream + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream pool to use during loading and inference. If not given, a new - resource will be constructed. + handle will be constructed. """ model_path = pathlib.Path(model_file) if not model_path.exists(): @@ -157,7 +157,7 @@ def load_model( treelite_model=tl_model, device=device, device_id=device_id, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -174,7 +174,7 @@ def load_from_sklearn( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a Scikit-Learn forest model to nvForest @@ -208,10 +208,10 @@ def load_from_sklearn( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - resource : nvforest.DeviceResources or None - For GPU execution, the device resource containing the stream or stream + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream pool to use during loading and inference. If not given, a new - resource will be constructed. + handle will be constructed. 
""" tl_model = treelite.sklearn.import_model(skl_model) @@ -219,7 +219,7 @@ def load_from_sklearn( treelite_model=tl_model, device=device, device_id=device_id, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -236,7 +236,7 @@ def load_from_treelite_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a Treelite forest model to nvForest @@ -270,16 +270,16 @@ def load_from_treelite_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - resource : nvforest.DeviceResources or None - For GPU execution, the device resource containing the stream or stream + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream pool to use during loading and inference. If not given, a new - resource will be constructed. + handle will be constructed. 
""" return make_forest_inference_object( treelite_model=tl_model, device=device, device_id=device_id, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py index b3de975..2c3c933 100644 --- a/python/nvforest/nvforest/_forest_inference.py +++ b/python/nvforest/nvforest/_forest_inference.py @@ -18,7 +18,7 @@ from cuda.bindings import runtime from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor -from nvforest._device_resources import DeviceResources +from nvforest._handle import Handle from nvforest._typing import DataType from nvforest.detail.forest_inference import ForestInferenceImpl @@ -123,7 +123,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -250,7 +250,7 @@ def optimize( else: test_instances[layout] = type(self)._create_with_layout( treelite_model_bytes=self.forest.treelite_model_bytes, - resource=self.forest.resource, + handle=self.forest.handle, layout=layout, default_chunk_size=None, align_bytes=self.forest.align_bytes, @@ -293,7 +293,7 @@ def optimize( # Return a new instance with optimal settings return type(self)._create_with_layout( treelite_model_bytes=self.forest.treelite_model_bytes, - resource=self.forest.resource, + handle=self.forest.handle, layout=optimal_layout, default_chunk_size=optimal_chunk_size, align_bytes=self.forest.align_bytes, @@ -310,7 +310,7 @@ def __init__( self, *, treelite_model: treelite.Model, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -322,7 +322,7 @@ def __init__( treelite_model=treelite_model, device="cpu", device_id=-1, - 
resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -334,7 +334,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -346,7 +346,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -425,7 +425,7 @@ def __init__( self, *, treelite_model: treelite.Model, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -437,7 +437,7 @@ def __init__( treelite_model=treelite_model, device="cpu", device_id=-1, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -449,7 +449,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -461,7 +461,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -532,7 +532,7 @@ def __init__( self, *, treelite_model: treelite.Model, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -545,7 +545,7 @@ def __init__( treelite_model=treelite_model, device="gpu", device_id=device_id, - resource=resource, + handle=handle, layout=layout, 
default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -557,7 +557,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -569,7 +569,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -649,7 +649,7 @@ def __init__( self, *, treelite_model: treelite.Model, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -662,7 +662,7 @@ def __init__( treelite_model=treelite_model, device="gpu", device_id=device_id, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, @@ -674,7 +674,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - resource: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -686,7 +686,7 @@ def _create_with_layout( tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes) return cls( treelite_model=tl_model, - resource=resource, + handle=handle, layout=layout, default_chunk_size=default_chunk_size, align_bytes=align_bytes, diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py new file mode 100644 index 0000000..0633eee --- /dev/null +++ b/python/nvforest/nvforest/_handle.py @@ -0,0 +1,9 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# + +from pylibraft.common.handle import DeviceResources as RaftDeviceResources + +# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources +Handle = RaftDeviceResources diff --git a/python/nvforest/nvforest/detail/device_resources.pxd b/python/nvforest/nvforest/detail/device_resources.pxd deleted file mode 100644 index 8aa4aef..0000000 --- a/python/nvforest/nvforest/detail/device_resources.pxd +++ /dev/null @@ -1,20 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# - -from libcpp.memory cimport unique_ptr - -from nvforest.detail.raft_proto.cuda_stream cimport ( - cuda_stream as raft_proto_stream_t, -) - - -cdef extern from "nvforest/device_resources.hpp" namespace "nvforest" nogil: - cdef cppclass device_resources: - device_resources() except + - raft_proto_stream_t get_next_usable_stream() except + - void synchronize() except + - -cdef class DeviceResources: - cdef unique_ptr[device_resources] c_obj diff --git a/python/nvforest/nvforest/detail/device_resources.pyx b/python/nvforest/nvforest/detail/device_resources.pyx deleted file mode 100644 index 98ca786..0000000 --- a/python/nvforest/nvforest/detail/device_resources.pyx +++ /dev/null @@ -1,17 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
-# SPDX-License-Identifier: Apache-2.0 -# - -cdef class DeviceResources: - def __cinit__(self, c_obj=None): - self.c_obj.reset(new device_resources()) - - def __getstate__(self): - return object() - - def __setstate__(self, state): - self.c_obj.reset(new device_resources()) - - def get_c_obj(self): - return self.c_obj.get() diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index 557d3ee..946c3fa 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -8,15 +8,15 @@ from typing import Optional, Union import numpy as np import treelite -from nvforest._device_resources import DeviceResources +from nvforest._handle import Handle from nvforest._typing import DataType from nvforest.detail.treelite import safe_treelite_call from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool +from rmm.librmm.cuda_stream_view cimport cuda_stream_view -from nvforest.detail.device_resources cimport device_resources from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op from nvforest.detail.raft_proto.cuda_stream cimport ( @@ -34,6 +34,14 @@ from nvforest.detail.treelite cimport ( ) +cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil: + cdef cppclass device_resources: + device_resources() except + + cuda_stream_view get_next_usable_stream() except + + void sync_stream() except + + void sync_stream_pool() except + + + cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( @@ -69,13 +77,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: cdef class ForestInference_impl(): cdef forest_model model - cdef object py_resource - cdef device_resources* c_resource + cdef object py_handle + cdef device_resources* 
c_handle cdef object device def __cinit__( self, - resource: object, + handle: object, tl_model_bytes: Union[bytes, bytearray], *, layout: str = "depth_first", @@ -84,8 +92,8 @@ cdef class ForestInference_impl(): device: str = "cpu", device_id: Optional[int] = None, ): - self.py_resource = resource - self.c_resource = self.py_resource.get_c_obj() + self.py_handle = handle + self.c_handle = self.py_handle.getHandle() cdef optional[bool] use_double_precision_c cdef bool use_double_precision_bool @@ -130,7 +138,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.c_resource.get_next_usable_stream() + self.c_handle.get_next_usable_stream().value() ) safe_treelite_call( @@ -240,7 +248,7 @@ cdef class ForestInference_impl(): if model_dtype == np.float32: self.model.predict[float]( - deref(self.c_resource), + deref(self.c_handle), out_ptr, in_ptr, n_rows, @@ -251,7 +259,7 @@ cdef class ForestInference_impl(): ) else: self.model.predict[double]( - deref(self.c_resource), + deref(self.c_handle), out_ptr, in_ptr, n_rows, @@ -262,7 +270,8 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.c_resource.synchronize() + self.c_handle.sync_stream_pool() + self.c_handle.sync_stream() return preds @@ -273,7 +282,7 @@ class ForestInferenceImpl: treelite_model: treelite.Model, device: str, device_id: int, - resource: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -281,7 +290,7 @@ class ForestInferenceImpl: ): # Assumption: The caller needs to pass in correct (device, device_id) pair # This function will not contain any logic for auto-detecting device. 
- self.resource = DeviceResources() if resource is None else resource + self.handle = Handle() if handle is None else handle self._layout = layout self.precision = precision self.default_chunk_size = default_chunk_size @@ -309,7 +318,7 @@ class ForestInferenceImpl: self._treelite_model_bytes = treelite_model.serialize_bytes() self.impl = ForestInference_impl( - self.resource, + self.handle, self._treelite_model_bytes, layout=self._layout, align_bytes=self.align_bytes, From 0c314cb421e5ec44b82e91c5a321a47625f7cf59 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 2 May 2026 01:20:07 -0700 Subject: [PATCH 18/35] Revert "Create device_resource wrapper in C++" This reverts commit 564e38c4f9900838a6cb8ea1f99c813e484075ec. --- cpp/include/nvforest/device_resources.hpp | 22 ---------------------- cpp/include/nvforest/forest_model.hpp | 15 ++++++++------- cpp/tests/treelite_importer.cpp | 5 +++-- 3 files changed, 11 insertions(+), 31 deletions(-) delete mode 100644 cpp/include/nvforest/device_resources.hpp diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp deleted file mode 100644 index 04621d3..0000000 --- a/cpp/include/nvforest/device_resources.hpp +++ /dev/null @@ -1,22 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once - -#include - -namespace nvforest { - -class device_resources { - public: - device_resources() : res_{} {} - - auto get_next_usable_stream() const { return res_.get_next_usable_stream(); } - auto get_stream_pool_size() const { return res_.get_stream_pool_size(); } - - private: - raft::device_resources res_; -}; - -} // namespace nvforest diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 84987a8..be45584 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -7,9 +7,10 @@ #include #include #include -#include #include +#include + #include #include #include @@ -152,7 +153,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource device resource which will be used to provide + * @param[in] resource RAFT resource which will be used to provide * streams for evaluation. * @param[out] output The buffer where model output should be stored. If * this buffer is on host while the model is on device or vice versa, @@ -177,7 +178,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(nvforest::device_resources const& resource, + void predict(raft::device_resources const& resource, raft_proto::buffer& output, raft_proto::buffer const& input, infer_kind predict_type = infer_kind::default_kind, @@ -253,7 +254,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource device resource which will be used to provide + * @param[in] resource RAFT resource which will be used to provide * streams for evaluation. * @param[out] output Pointer to the memory location where output should end * up @@ -277,7 +278,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. 
*/ template - void predict(nvforest::device_resources const& resource, + void predict(raft::device_resources const& resource, io_t* output, io_t* input, std::size_t num_rows, @@ -301,7 +302,7 @@ struct forest_model { } /** - * Perform inference on given input (with auto-instantiated device resource) + * Perform inference on given input (with auto-instantiated RAFT resource) * * @param[out] output Pointer to the memory location where output should end * up @@ -333,7 +334,7 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - auto resource = nvforest::device_resources{}; + auto resource = raft::device_resources{}; predict(resource, output, input, diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index d79c423..f6f3413 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -4,11 +4,12 @@ */ #include -#include #include #include #include +#include + #include #include #include @@ -338,7 +339,7 @@ TEST(TreeliteImporter, DegenerateTree) auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); ASSERT_FALSE(fil_model.has_vector_leaves()); - auto resource = nvforest::device_resources{}; + auto resource = raft::device_resources{}; auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); auto expected_preds = std::vector{1.0}; From b3fd3be434c2d53e1a3fd27df1f39bb44b0fbc50 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 2 May 2026 02:32:24 -0700 Subject: [PATCH 19/35] Cache auto-instantiated RAFT resource --- cpp/include/nvforest/forest_model.hpp | 12 ++- cpp/tests/treelite_importer.cpp | 107 +++++++++++++------------- 2 files changed, 64 insertions(+), 55 deletions(-) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index be45584..0fd75a4 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -12,6 +12,7 @@ #include 
#include +#include #include #include @@ -26,7 +27,7 @@ namespace nvforest { struct forest_model { /** Wrap a decision_forest in a full forest_model object */ forest_model(decision_forest_variant&& forest = decision_forest_variant{}) - : decision_forest_{forest} + : decision_forest_{forest}, cached_device_resources_{} { } @@ -334,8 +335,11 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - auto resource = raft::device_resources{}; - predict(resource, + // Auto-instantiate RAFT resource and cache it + if (!cached_device_resources_) { + cached_device_resources_ = std::make_unique(); + } + predict(*cached_device_resources_, output, input, num_rows, @@ -347,6 +351,8 @@ struct forest_model { private: decision_forest_variant decision_forest_; + // Cache for auto-instantiated RAFT device resource + std::unique_ptr cached_device_resources_; }; } // namespace nvforest diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index f6f3413..219ba2f 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -249,45 +249,45 @@ auto static const SAMPLE_FOREST = []() { TEST(TreeliteImporter, depth_first) { - auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first); - ASSERT_EQ(fil_model.num_features(), 7); - ASSERT_EQ(fil_model.num_outputs(), 1); - ASSERT_EQ(fil_model.num_trees(), 6); - ASSERT_FALSE(fil_model.has_vector_leaves()); - ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable); - ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(fil_model.device_index(), -1); - ASSERT_FALSE(fil_model.is_double_precision()); + auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first); + ASSERT_EQ(nvforest_model.num_features(), 7); + ASSERT_EQ(nvforest_model.num_outputs(), 1); + 
ASSERT_EQ(nvforest_model.num_trees(), 6); + ASSERT_FALSE(nvforest_model.has_vector_leaves()); + ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.device_index(), -1); + ASSERT_FALSE(nvforest_model.is_double_precision()); } TEST(TreeliteImporter, breadth_first) { - auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first); - ASSERT_EQ(fil_model.num_features(), 7); - ASSERT_EQ(fil_model.num_outputs(), 1); - ASSERT_EQ(fil_model.num_trees(), 6); - ASSERT_FALSE(fil_model.has_vector_leaves()); - ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable); - ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(fil_model.device_index(), -1); - ASSERT_FALSE(fil_model.is_double_precision()); + auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first); + ASSERT_EQ(nvforest_model.num_features(), 7); + ASSERT_EQ(nvforest_model.num_outputs(), 1); + ASSERT_EQ(nvforest_model.num_trees(), 6); + ASSERT_FALSE(nvforest_model.has_vector_leaves()); + ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.device_index(), -1); + ASSERT_FALSE(nvforest_model.is_double_precision()); } TEST(TreeliteImporter, layered_children_together) { - auto fil_model = + auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::layered_children_together); - ASSERT_EQ(fil_model.num_features(), 7); - ASSERT_EQ(fil_model.num_outputs(), 1); - ASSERT_EQ(fil_model.num_trees(), 6); - ASSERT_FALSE(fil_model.has_vector_leaves()); - ASSERT_EQ(fil_model.row_postprocessing(), 
row_op::disable); - ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(fil_model.device_index(), -1); - ASSERT_FALSE(fil_model.is_double_precision()); + ASSERT_EQ(nvforest_model.num_features(), 7); + ASSERT_EQ(nvforest_model.num_outputs(), 1); + ASSERT_EQ(nvforest_model.num_trees(), 6); + ASSERT_FALSE(nvforest_model.has_vector_leaves()); + ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.device_index(), -1); + ASSERT_FALSE(nvforest_model.is_double_precision()); } template @@ -335,42 +335,45 @@ auto make_degenerate_tree(const leaf_t& leaf) TEST(TreeliteImporter, DegenerateTree) { - auto tl_model = make_degenerate_tree(1.0); - auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); - ASSERT_FALSE(fil_model.has_vector_leaves()); + auto tl_model = make_degenerate_tree(1.0); + auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); + ASSERT_FALSE(nvforest_model.has_vector_leaves()); auto resource = raft::device_resources{}; auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); auto expected_preds = std::vector{1.0}; - fil_model.predict(resource, - preds.data(), - X.data(), - 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1); + nvforest_model.predict(resource, + preds.data(), + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1); ASSERT_EQ(preds, expected_preds); } TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) { - auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); - auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); - 
ASSERT_TRUE(fil_model.has_vector_leaves()); + auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); + auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); + ASSERT_TRUE(nvforest_model.has_vector_leaves()); auto X = std::vector{0.0}; auto preds = std::vector(2, 0.0); auto expected_preds = std::vector{0.5, 0.5}; - fil_model.predict(preds.data(), - X.data(), - 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1); - ASSERT_EQ(preds, expected_preds); + for (int i = 0; i < 3; ++i) { + // Make sure that auto-instantiated RAFT resource gets cached properly + nvforest_model.predict(preds.data(), + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1); + ASSERT_EQ(preds, expected_preds); + } } } // namespace nvforest From a1f05d329effeb6e2d2d5b9252684d52beaf83fb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 2 May 2026 02:33:47 -0700 Subject: [PATCH 20/35] Remove the note about RAFT resource --- docs/source/getting_started.rst | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 11d70d3..e94b666 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -202,24 +202,3 @@ Now that the tree model is fully imported into nvForest, let's run inference: fm.predict(output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); - -.. note:: Reuse the resource handle to reduce overhead - - nvForest internally creates a resource handle - (:cpp:class:`nvforest::device_resources`) - to manage GPU resources. Creation of the resource handle adds a slight - performance overhead. 
If you plan to call - :cpp:func:`~nvforest::forest_model::predict` multiple times, consider - creating the resource handle explicitly and re-using the handle between - the function calls. - - .. code-block:: cpp - - #include - - auto resource = nvforest::device_resources{}; - - // Calling predict multiple times ... - fm.predict(resource, output, input, num_rows, - raft_proto::device_type::gpu, raft_proto::device_type::gpu, - nvforest::infer_kind::default_kind); From 5fc3d6d69383a9e9ca8b62733cd0a4ae40940d37 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 2 May 2026 02:57:14 -0700 Subject: [PATCH 21/35] Add sync for auto-instantiated RAFT resource --- cpp/include/nvforest/forest_model.hpp | 6 ++++- cpp/tests/CMakeLists.txt | 2 +- ...lite_importer.cpp => treelite_importer.cu} | 25 +++++++++++++------ 3 files changed, 23 insertions(+), 10 deletions(-) rename cpp/tests/{treelite_importer.cpp => treelite_importer.cu} (93%) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 0fd75a4..cb51a26 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -303,7 +303,9 @@ struct forest_model { } /** - * Perform inference on given input (with auto-instantiated RAFT resource) + * Perform inference on given input (with auto-instantiated RAFT resource). + * Note. This function is blocking and will synchronize the underlying RAFT + * resource at return time. 
* * @param[out] output Pointer to the memory location where output should end * up @@ -347,6 +349,8 @@ struct forest_model { in_mem_type, predict_type, specified_chunk_size); + cached_device_resources_->sync_stream_pool(); + cached_device_resources_->sync_stream(); } private: diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 18bef95..db7d7f8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -84,6 +84,6 @@ ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp) ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu) ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp) ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp) -ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp) +ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cu) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libnvforest) diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cu similarity index 93% rename from cpp/tests/treelite_importer.cpp rename to cpp/tests/treelite_importer.cu index 219ba2f..69af84d 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cu @@ -10,6 +10,8 @@ #include +#include + #include #include #include @@ -357,22 +359,29 @@ TEST(TreeliteImporter, DegenerateTree) TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) { auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); - auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); + auto nvforest_model = import_from_treelite_model(*tl_model, + tree_layout::breadth_first, + index_type{}, + std::nullopt, + raft_proto::device_type::gpu); ASSERT_TRUE(nvforest_model.has_vector_leaves()); - auto X = std::vector{0.0}; - auto preds = std::vector(2, 0.0); + auto X = thrust::device_vector{0.0}; + auto preds = thrust::device_vector(2, 0.0); + auto h_preds = std::vector(2); auto expected_preds = std::vector{0.5, 
0.5}; for (int i = 0; i < 3; ++i) { // Make sure that auto-instantiated RAFT resource gets cached properly - nvforest_model.predict(preds.data(), - X.data(), + // For this interface, predict() will synchronize the stream automatically. + nvforest_model.predict(thrust::raw_pointer_cast(preds.data()), + thrust::raw_pointer_cast(X.data()), 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, + raft_proto::device_type::gpu, + raft_proto::device_type::gpu, nvforest::infer_kind::default_kind, 1); - ASSERT_EQ(preds, expected_preds); + thrust::copy(preds.begin(), preds.end(), h_preds.begin()); + ASSERT_EQ(h_preds, expected_preds); } } From 3be33b9340b3129894fa213ede7ec29c6c39d995 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 02:19:54 +0000 Subject: [PATCH 22/35] Fix RAFT resource stream conversion --- cpp/include/nvforest/forest_model.hpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index cef399c..9f5a951 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -196,8 +196,11 @@ struct forest_model { using model_io_t = typename std::remove_reference_t::io_type; if constexpr (std::is_same_v) { if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { - concrete_forest.predict( - output, input, resource.get_next_usable_stream(), predict_type, specified_chunk_size); + concrete_forest.predict(output, + input, + get_next_raft_proto_stream(resource), + predict_type, + specified_chunk_size); } else { auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64}; auto constexpr static const MAX_CHUNK_SIZE = std::size_t{64}; @@ -209,7 +212,7 @@ struct forest_model { specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); auto partition_count = raft_proto::ceildiv(row_count, partition_size); for (auto i = std::size_t{}; i < 
partition_count; ++i) { - auto stream = resource.get_next_usable_stream(); + auto stream = get_next_raft_proto_stream(resource); auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); auto partition_in = raft_proto::buffer{}; @@ -363,6 +366,17 @@ struct forest_model { } private: + static raft_proto::cuda_stream get_next_raft_proto_stream( + raft::device_resources const& resource) + { +#ifdef NVFOREST_ENABLE_GPU + return resource.get_next_usable_stream().value(); +#else + (void)resource; + return raft_proto::cuda_stream{}; +#endif + } + decision_forest_variant decision_forest_; // Cache for auto-instantiated RAFT device resource std::unique_ptr cached_device_resources_; From bb6616ad73679aaa2ebd741e06ae235570dcb1c7 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 02:30:40 +0000 Subject: [PATCH 23/35] Move RAFT stream adapter into raft_proto --- .../nvforest/detail/raft_proto/resources.hpp | 23 +++++++++++++++++++ cpp/include/nvforest/forest_model.hpp | 18 +++------------ 2 files changed, 26 insertions(+), 15 deletions(-) create mode 100644 cpp/include/nvforest/detail/raft_proto/resources.hpp diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp new file mode 100644 index 0000000..5ab952c --- /dev/null +++ b/cpp/include/nvforest/detail/raft_proto/resources.hpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include + +namespace raft_proto { + +inline cuda_stream get_next_usable_stream(raft::device_resources const& resource) +{ +#ifdef NVFOREST_ENABLE_GPU + return resource.get_next_usable_stream().value(); +#else + (void)resource; + return cuda_stream{}; +#endif +} + +} // namespace raft_proto diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 9f5a951..6692f75 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -8,10 +8,9 @@ #include #include #include +#include #include -#include - #ifdef NVFOREST_ENABLE_GPU #include #endif @@ -198,7 +197,7 @@ struct forest_model { if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { concrete_forest.predict(output, input, - get_next_raft_proto_stream(resource), + raft_proto::get_next_usable_stream(resource), predict_type, specified_chunk_size); } else { @@ -212,7 +211,7 @@ struct forest_model { specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); auto partition_count = raft_proto::ceildiv(row_count, partition_size); for (auto i = std::size_t{}; i < partition_count; ++i) { - auto stream = get_next_raft_proto_stream(resource); + auto stream = raft_proto::get_next_usable_stream(resource); auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); auto partition_in = raft_proto::buffer{}; @@ -366,17 +365,6 @@ struct forest_model { } private: - static raft_proto::cuda_stream get_next_raft_proto_stream( - raft::device_resources const& resource) - { -#ifdef NVFOREST_ENABLE_GPU - return resource.get_next_usable_stream().value(); -#else - (void)resource; - return raft_proto::cuda_stream{}; -#endif - } - decision_forest_variant decision_forest_; // Cache for auto-instantiated RAFT device resource std::unique_ptr cached_device_resources_; From afd5cab2d9e84277172cfcb2a597aea5fc7c1dac Mon Sep 17 
00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 13:55:14 +0000 Subject: [PATCH 24/35] Update C++ README for device resources API --- cpp/include/nvforest/README.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md index 3e3df17..38eff4c 100644 --- a/cpp/include/nvforest/README.md +++ b/cpp/include/nvforest/README.md @@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float)); // Assuming that input is a float* pointing to data already located on-device -auto handle = raft_proto::handle_t{}; - nvforest_model.predict( - handle, output, input, num_rows, @@ -119,11 +116,10 @@ nvforest_model.predict( ); ``` -**handle**: To provide a unified interface on CPU and GPU, we introduce -`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a -placeholder in CPU-only builds, and using it does not require any CUDA -functionality. For GPU-enabled builds, you can construct a -`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use. +The overload shown above auto-instantiates and caches a `raft::device_resources` +object. It synchronizes the resource before returning. Applications that need to +control stream or stream-pool usage can instead pass an existing +`raft::device_resources` object as the first argument. **output**: Pointer to pre-allocated buffer where results should be written. 
If the model has been loaded at single precision, this should be a From 5af8cac4407b2a96dcd32481ecd62f2cf3c5ff96 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 13:55:19 +0000 Subject: [PATCH 25/35] Remove stale Treelite importer handle include --- cpp/tests/treelite_importer.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu index 307e000..69af84d 100644 --- a/cpp/tests/treelite_importer.cu +++ b/cpp/tests/treelite_importer.cu @@ -4,7 +4,6 @@ */ #include -#include #include #include #include From a2111d29e619884583355d44577d72fa7542870b Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 14:23:50 +0000 Subject: [PATCH 26/35] Reject null prediction pointers --- cpp/include/nvforest/forest_model.hpp | 4 +++ cpp/tests/treelite_importer.cu | 38 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 6692f75..2d4ecff 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -295,6 +295,10 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { + if (num_rows != 0 && (output == nullptr || input == nullptr)) { + throw runtime_error{"Input and output pointers must be non-null when num_rows > 0"}; + } + int current_device_id; if (out_mem_type == raft_proto::device_type::gpu || in_mem_type == raft_proto::device_type::gpu) { diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu index 69af84d..ab18453 100644 --- a/cpp/tests/treelite_importer.cu +++ b/cpp/tests/treelite_importer.cu @@ -356,6 +356,44 @@ TEST(TreeliteImporter, DegenerateTree) ASSERT_EQ(preds, expected_preds); } +TEST(TreeliteImporter, RejectNullPointers) +{ + auto tl_model = make_degenerate_tree(1.0); + auto nvforest_model = import_from_treelite_model(*tl_model, 
tree_layout::breadth_first); + + auto resource = raft::device_resources{}; + auto X = std::vector{0.0}; + auto preds = std::vector(1, 0.0); + auto* null_pointer = static_cast(nullptr); + + EXPECT_THROW(nvforest_model.predict(resource, + null_pointer, + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1), + nvforest::runtime_error); + EXPECT_THROW(nvforest_model.predict(resource, + preds.data(), + null_pointer, + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1), + nvforest::runtime_error); + EXPECT_THROW(nvforest_model.predict(null_pointer, + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1), + nvforest::runtime_error); +} + TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) { auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); From 6e2cc4275461c4623718d53cef6f65252d3cb3c0 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 14:23:58 +0000 Subject: [PATCH 27/35] Document device resources migration path --- docs/source/getting_started.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index e94b666..c3fce83 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -202,3 +202,17 @@ Now that the tree model is fully imported into nvForest, let's run inference: fm.predict(output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); + +The overload shown above auto-instantiates and caches a ``raft::device_resources`` +object and synchronizes before returning. Code that previously constructed a +``raft_proto::handle_t`` should instead pass a ``raft::device_resources`` object +directly when it needs to control CUDA stream or stream-pool usage: + +.. 
code-block:: cpp + + #include + + raft::device_resources resource{}; + fm.predict(resource, output, input, num_rows, + raft_proto::device_type::gpu, raft_proto::device_type::gpu, + nvforest::infer_kind::default_kind); From 902c2ea5f2bdba4313796944ed30a9a4824342b1 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 14:24:08 +0000 Subject: [PATCH 28/35] Clarify auto-resource test coverage --- cpp/tests/treelite_importer.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu index ab18453..71cd18e 100644 --- a/cpp/tests/treelite_importer.cu +++ b/cpp/tests/treelite_importer.cu @@ -409,8 +409,8 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) auto h_preds = std::vector(2); auto expected_preds = std::vector{0.5, 0.5}; for (int i = 0; i < 3; ++i) { - // Make sure that auto-instantiated RAFT resource gets cached properly - // For this interface, predict() will synchronize the stream automatically. + // Repeatedly exercise the auto-resource overload. This interface + // synchronizes the stream before returning. nvforest_model.predict(thrust::raw_pointer_cast(preds.data()), thrust::raw_pointer_cast(X.data()), 1, From 779f8c23187e8796f05b41f8ca59e7203014b0c9 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 14:31:36 +0000 Subject: [PATCH 29/35] Clarify device resources usage docs --- docs/source/getting_started.rst | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index c3fce83..46b1b27 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -204,9 +204,9 @@ Now that the tree model is fully imported into nvForest, let's run inference: nvforest::infer_kind::default_kind); The overload shown above auto-instantiates and caches a ``raft::device_resources`` -object and synchronizes before returning. 
Code that previously constructed a -``raft_proto::handle_t`` should instead pass a ``raft::device_resources`` object -directly when it needs to control CUDA stream or stream-pool usage: +object and synchronizes before returning. This is the recommended path unless +your application needs to control CUDA stream or stream-pool usage directly. For +that advanced use case, pass an explicit ``raft::device_resources`` object: .. code-block:: cpp @@ -216,3 +216,14 @@ directly when it needs to control CUDA stream or stream-pool usage: fm.predict(resource, output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); + +.. note:: + + In version 26.06, the C++ prediction API changed from ``raft_proto::handle_t`` + to ``raft::device_resources``. Code that previously constructed a + ``raft_proto::handle_t`` should use the no-resource overload shown above, or + if the previous call site relied on explicit CUDA stream or stream-pool + control, replace calls of the form + ``fm.predict(handle, output, input, num_rows, ...)`` with + ``fm.predict(resource, output, input, num_rows, ...)`` using + ``raft::device_resources``. From 397835f7a570c67d8e0e32ef8d7811fb9852705d Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 15:05:46 +0000 Subject: [PATCH 30/35] Document RAFT resource migration debt --- cpp/include/nvforest/detail/raft_proto/resources.hpp | 4 ++++ cpp/include/nvforest/forest_model.hpp | 3 +++ 2 files changed, 7 insertions(+) diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp index 5ab952c..091acc8 100644 --- a/cpp/include/nvforest/detail/raft_proto/resources.hpp +++ b/cpp/include/nvforest/detail/raft_proto/resources.hpp @@ -10,6 +10,10 @@ namespace raft_proto { +// TODO(nvforest#121): This is a compatibility bridge while nvForest accepts +// raft::device_resources publicly but still uses raft_proto::cuda_stream +// internally. 
Remove this once internal resource/stream handling is migrated to +// RAFT/RMM abstractions consistently. inline cuda_stream get_next_usable_stream(raft::device_resources const& resource) { #ifdef NVFOREST_ENABLE_GPU diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 2d4ecff..1c7f2a5 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -134,6 +134,9 @@ struct forest_model { * optimal value a priori. If omitted, a heuristic will be used to select a * reasonable value. On CPU, this argument can generally just be omitted. */ + // TODO(nvforest#121): This lower-level overload still exposes + // raft_proto::cuda_stream. Revisit it as part of the internal migration to + // consistent RAFT/RMM resource and stream abstractions. template void predict(raft_proto::buffer& output, raft_proto::buffer const& input, From 5f344d0726239e116c5ba86d332a4ba039867358 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 15:09:20 +0000 Subject: [PATCH 31/35] Deprecate Python Handle alias --- python/nvforest/nvforest/__init__.py | 17 +++++++++- python/nvforest/nvforest/_factory.py | 34 +++++++++---------- python/nvforest/nvforest/_forest_inference.py | 20 +++++------ python/nvforest/nvforest/_handle.py | 19 +++++++++-- .../nvforest/detail/forest_inference.pyx | 6 ++-- 5 files changed, 63 insertions(+), 33 deletions(-) diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py index 4fc9b15..4fc3ff2 100644 --- a/python/nvforest/nvforest/__init__.py +++ b/python/nvforest/nvforest/__init__.py @@ -24,12 +24,13 @@ GPUForestInferenceClassifier, GPUForestInferenceRegressor, ) -from nvforest._handle import Handle +from nvforest._handle import DeviceResources from nvforest._version import __git_commit__, __version__ __all__ = [ "CPUForestInferenceClassifier", "CPUForestInferenceRegressor", + "DeviceResources", "GPUForestInferenceClassifier", 
"GPUForestInferenceRegressor", "Handle", @@ -39,3 +40,17 @@ "__git_commit__", "__version__", ] + + +def __getattr__(name): + if name == "Handle": + import warnings + + warnings.warn( + "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 " + "and will be removed in 26.08.", + FutureWarning, + stacklevel=2, + ) + return DeviceResources + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py index cc0700a..4943b91 100644 --- a/python/nvforest/nvforest/_factory.py +++ b/python/nvforest/nvforest/_factory.py @@ -17,7 +17,7 @@ infer_device, infer_is_classifier, ) -from nvforest._handle import Handle +from nvforest._handle import DeviceResources def get_forest_inference_class(device, is_classifier) -> type: @@ -39,7 +39,7 @@ def make_forest_inference_object( treelite_model: treelite.Model, device: str, device_id: Optional[int], - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -72,7 +72,7 @@ def load_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, ) -> ForestInference: """Load a model into nvForest from a serialized model file. @@ -113,10 +113,10 @@ def load_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream - pool to use during loading and inference. If not given, a new - handle will be constructed. + handle : nvforest.DeviceResources or None + For GPU execution, the nvForest device resources containing the stream + or stream pool to use during loading and inference. 
If not given, a new + device resources object will be constructed. """ model_path = pathlib.Path(model_file) if not model_path.exists(): @@ -174,7 +174,7 @@ def load_from_sklearn( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, ) -> ForestInference: """Load a Scikit-Learn forest model to nvForest @@ -208,10 +208,10 @@ def load_from_sklearn( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream - pool to use during loading and inference. If not given, a new - handle will be constructed. + handle : nvforest.DeviceResources or None + For GPU execution, the nvForest device resources containing the stream + or stream pool to use during loading and inference. If not given, a new + device resources object will be constructed. """ tl_model = treelite.sklearn.import_model(skl_model) @@ -236,7 +236,7 @@ def load_from_treelite_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, ) -> ForestInference: """Load a Treelite forest model to nvForest @@ -270,10 +270,10 @@ def load_from_treelite_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.Handle or None - For GPU execution, the nvForest handle containing the stream or stream - pool to use during loading and inference. If not given, a new - handle will be constructed. 
+ handle : nvforest.DeviceResources or None + For GPU execution, the nvForest device resources containing the stream + or stream pool to use during loading and inference. If not given, a new + device resources object will be constructed. """ return make_forest_inference_object( treelite_model=tl_model, diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py index 2c3c933..a7602ae 100644 --- a/python/nvforest/nvforest/_forest_inference.py +++ b/python/nvforest/nvforest/_forest_inference.py @@ -18,7 +18,7 @@ from cuda.bindings import runtime from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor -from nvforest._handle import Handle +from nvforest._handle import DeviceResources from nvforest._typing import DataType from nvforest.detail.forest_inference import ForestInferenceImpl @@ -123,7 +123,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -310,7 +310,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -334,7 +334,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -425,7 +425,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -449,7 +449,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, 
default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -532,7 +532,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -557,7 +557,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -649,7 +649,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -674,7 +674,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[Handle], + handle: Optional[DeviceResources], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py index 0633eee..878c48a 100644 --- a/python/nvforest/nvforest/_handle.py +++ b/python/nvforest/nvforest/_handle.py @@ -3,7 +3,22 @@ # SPDX-License-Identifier: Apache-2.0 # +import warnings + from pylibraft.common.handle import DeviceResources as RaftDeviceResources -# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources -Handle = RaftDeviceResources +DeviceResources = RaftDeviceResources + +__all__ = ["DeviceResources", "Handle"] + + +def __getattr__(name): + if name == "Handle": + warnings.warn( + "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 " + "and will be removed in 26.08.", + FutureWarning, + stacklevel=2, + ) + return DeviceResources + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx 
b/python/nvforest/nvforest/detail/forest_inference.pyx index 946c3fa..3a513b6 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -8,7 +8,7 @@ from typing import Optional, Union import numpy as np import treelite -from nvforest._handle import Handle +from nvforest._handle import DeviceResources from nvforest._typing import DataType from nvforest.detail.treelite import safe_treelite_call @@ -282,7 +282,7 @@ class ForestInferenceImpl: treelite_model: treelite.Model, device: str, device_id: int, - handle: Optional[Handle] = None, + handle: Optional[DeviceResources] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -290,7 +290,7 @@ class ForestInferenceImpl: ): # Assumption: The caller needs to pass in correct (device, device_id) pair # This function will not contain any logic for auto-detecting device. - self.handle = Handle() if handle is None else handle + self.handle = DeviceResources() if handle is None else handle self._layout = layout self.precision = precision self.default_chunk_size = default_chunk_size From 121c40059abd74e0f4799a0b2eb3e42cf788797d Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 15:37:24 +0000 Subject: [PATCH 32/35] Build nvforest with C++20 --- cpp/CMakeLists.txt | 4 ++-- cpp/tests/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fb9d015..712d0a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -257,9 +257,9 @@ set_target_properties( PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 
db7d7f8..4f4b256 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,9 +62,9 @@ function(ConfigureTest) set_target_properties( ${_NVFOREST_TEST_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON) set(_NVFOREST_TEST_COMPONENT_NAME testing) From 10499f25661b8ea3440a2251e966e2957734fd15 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 15:57:57 +0000 Subject: [PATCH 33/35] Reduce PR 102 to auto-instantiation --- cpp/CMakeLists.txt | 4 +- cpp/include/nvforest/README.md | 12 +- .../nvforest/detail/raft_proto/handle.hpp | 43 +++++ .../nvforest/detail/raft_proto/resources.hpp | 27 --- cpp/include/nvforest/forest_model.hpp | 85 +++------ cpp/tests/CMakeLists.txt | 6 +- ...lite_importer.cu => treelite_importer.cpp} | 168 +++++++----------- docs/source/getting_started.rst | 33 +--- python/nvforest/nvforest/__init__.py | 17 +- python/nvforest/nvforest/_factory.py | 34 ++-- python/nvforest/nvforest/_forest_inference.py | 20 +-- python/nvforest/nvforest/_handle.py | 21 +-- .../nvforest/detail/forest_inference.pyx | 43 ++--- .../nvforest/detail/raft_proto/handle.pxd | 19 ++ 14 files changed, 218 insertions(+), 314 deletions(-) create mode 100644 cpp/include/nvforest/detail/raft_proto/handle.hpp delete mode 100644 cpp/include/nvforest/detail/raft_proto/resources.hpp rename cpp/tests/{treelite_importer.cu => treelite_importer.cpp} (65%) create mode 100644 python/nvforest/nvforest/detail/raft_proto/handle.pxd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 712d0a3..fb9d015 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -257,9 +257,9 @@ set_target_properties( PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options - CXX_STANDARD 20 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 + CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE 
ON INTERFACE_POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md index 38eff4c..3e3df17 100644 --- a/cpp/include/nvforest/README.md +++ b/cpp/include/nvforest/README.md @@ -106,7 +106,10 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float)); // Assuming that input is a float* pointing to data already located on-device +auto handle = raft_proto::handle_t{}; + nvforest_model.predict( + handle, output, input, num_rows, @@ -116,10 +119,11 @@ nvforest_model.predict( ); ``` -The overload shown above auto-instantiates and caches a `raft::device_resources` -object. It synchronizes the resource before returning. Applications that need to -control stream or stream-pool usage can instead pass an existing -`raft::device_resources` object as the first argument. +**handle**: To provide a unified interface on CPU and GPU, we introduce +`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a +placeholder in CPU-only builds, and using it does not require any CUDA +functionality. For GPU-enabled builds, you can construct a +`raft_proto::handle_t` directly from the `raft::handle_t` you wish to use. **output**: Pointer to pre-allocated buffer where results should be written. If the model has been loaded at single precision, this should be a diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/detail/raft_proto/handle.hpp new file mode 100644 index 0000000..086d61b --- /dev/null +++ b/cpp/include/nvforest/detail/raft_proto/handle.hpp @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include + +#include +#include +#ifdef NVFOREST_ENABLE_GPU +#include +#endif + +namespace raft_proto { +#ifdef NVFOREST_ENABLE_GPU +struct handle_t { + handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {} + handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {} + auto get_next_usable_stream() const + { + return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()}; + } + auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); } + auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } + void synchronize() const + { + raft_handle_->sync_stream_pool(); + raft_handle_->sync_stream(); + } + + private: + // Have to store a pointer because handle is not movable + raft::handle_t const* raft_handle_; +}; +#else +struct handle_t { + auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; } + auto get_stream_pool_size() const { return std::size_t{}; } + auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } + void synchronize() const {} +}; +#endif +} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp deleted file mode 100644 index 091acc8..0000000 --- a/cpp/include/nvforest/detail/raft_proto/resources.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once - -#include - -#include - -namespace raft_proto { - -// TODO(nvforest#121): This is a compatibility bridge while nvForest accepts -// raft::device_resources publicly but still uses raft_proto::cuda_stream -// internally. Remove this once internal resource/stream handling is migrated to -// RAFT/RMM abstractions consistently. 
-inline cuda_stream get_next_usable_stream(raft::device_resources const& resource) -{ -#ifdef NVFOREST_ENABLE_GPU - return resource.get_next_usable_stream().value(); -#else - (void)resource; - return cuda_stream{}; -#endif -} - -} // namespace raft_proto diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 1c7f2a5..12c4738 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #ifdef NVFOREST_ENABLE_GPU @@ -16,7 +16,6 @@ #endif #include -#include #include #include @@ -31,7 +30,7 @@ namespace nvforest { struct forest_model { /** Wrap a decision_forest in a full forest_model object */ forest_model(decision_forest_variant&& forest = decision_forest_variant{}) - : decision_forest_{forest}, cached_device_resources_{} + : decision_forest_{forest} { } @@ -134,9 +133,6 @@ struct forest_model { * optimal value a priori. If omitted, a heuristic will be used to select a * reasonable value. On CPU, this argument can generally just be omitted. */ - // TODO(nvforest#121): This lower-level overload still exposes - // raft_proto::cuda_stream. Revisit it as part of the internal migration to - // consistent RAFT/RMM resource and stream abstractions. template void predict(raft_proto::buffer& output, raft_proto::buffer const& input, @@ -161,8 +157,8 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource RAFT resource which will be used to provide - * streams for evaluation. + * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t + * on GPU) which will be used to provide streams for evaluation. * @param[out] output The buffer where model output should be stored. If * this buffer is on host while the model is on device or vice versa, * work will be distributed across available streams to copy the data back @@ -186,35 +182,31 @@ struct forest_model { * reasonable value. 
On CPU, this argument can generally just be omitted. */ template - void predict(raft::device_resources const& resource, + void predict(raft_proto::handle_t const& handle, raft_proto::buffer& output, raft_proto::buffer const& input, infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { std::visit( - [this, predict_type, &resource, &output, &input, &specified_chunk_size]( + [this, predict_type, &handle, &output, &input, &specified_chunk_size]( auto&& concrete_forest) { using model_io_t = typename std::remove_reference_t::io_type; if constexpr (std::is_same_v) { if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) { - concrete_forest.predict(output, - input, - raft_proto::get_next_usable_stream(resource), - predict_type, - specified_chunk_size); + concrete_forest.predict( + output, input, handle.get_next_usable_stream(), predict_type, specified_chunk_size); } else { auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64}; auto constexpr static const MAX_CHUNK_SIZE = std::size_t{64}; - auto row_count = input.size() / num_features(); - auto usable_stream_count = std::max(resource.get_stream_pool_size(), std::size_t{1}); + auto row_count = input.size() / num_features(); auto partition_size = - std::max(raft_proto::ceildiv(row_count, usable_stream_count), + std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()), specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); auto partition_count = raft_proto::ceildiv(row_count, partition_size); for (auto i = std::size_t{}; i < partition_count; ++i) { - auto stream = raft_proto::get_next_usable_stream(resource); + auto stream = handle.get_next_usable_stream(); auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); auto partition_in = raft_proto::buffer{}; @@ -265,8 +257,8 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] resource RAFT 
resource which will be used to provide - * streams for evaluation. + * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t + * on GPU) which will be used to provide streams for evaluation. * @param[out] output Pointer to the memory location where output should end * up * @param[in] input Pointer to the input data @@ -289,7 +281,7 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft::device_resources const& resource, + void predict(raft_proto::handle_t const& handle, io_t* output, io_t* input, std::size_t num_rows, @@ -298,10 +290,6 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - if (num_rows != 0 && (output == nullptr || input == nullptr)) { - throw runtime_error{"Input and output pointers must be non-null when num_rows > 0"}; - } - int current_device_id; if (out_mem_type == raft_proto::device_type::gpu || in_mem_type == raft_proto::device_type::gpu) { @@ -317,34 +305,12 @@ struct forest_model { raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id}; auto in_buffer = raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id}; - predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size); + predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size); } /** - * Perform inference on given input (with auto-instantiated RAFT resource). - * Note. This function is blocking and will synchronize the underlying RAFT - * resource at return time. 
- * - * @param[out] output Pointer to the memory location where output should end - * up - * @param[in] input Pointer to the input data - * @param[in] num_rows Number of rows in input - * @param[in] out_mem_type The memory type (device/host) of the output - * buffer - * @param[in] in_mem_type The memory type (device/host) of the input buffer - * @param[in] predict_type Type of inference to perform. Defaults to summing - * the outputs of all trees and produce an output per row. If set to - * "per_tree", we will instead output all outputs of individual trees. - * If set to "leaf_id", we will output the integer ID of the leaf node - * for each tree. - * @param[in] specified_chunk_size: Specifies the mini-batch size for - * processing. This has different meanings on CPU and GPU, but on GPU it - * corresponds to the number of rows evaluated per inference iteration - * on a single block. It can take on any power of 2 from 1 to 32, and - * runtime performance is quite sensitive to the value chosen. In general, - * larger batches benefit from higher values, but it is hard to predict the - * optimal value a priori. If omitted, a heuristic will be used to select a - * reasonable value. On CPU, this argument can generally just be omitted. + * Perform inference on given input using an internally managed RAFT handle. + * This function is blocking and synchronizes the handle before returning. 
*/ template void predict(io_t* output, @@ -355,11 +321,13 @@ struct forest_model { infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { - // Auto-instantiate RAFT resource and cache it - if (!cached_device_resources_) { - cached_device_resources_ = std::make_unique(); - } - predict(*cached_device_resources_, +#ifdef NVFOREST_ENABLE_GPU + auto raft_handle = raft::handle_t{}; + auto handle = raft_proto::handle_t{raft_handle}; +#else + auto handle = raft_proto::handle_t{}; +#endif + predict(handle, output, input, num_rows, @@ -367,14 +335,11 @@ struct forest_model { in_mem_type, predict_type, specified_chunk_size); - cached_device_resources_->sync_stream_pool(); - cached_device_resources_->sync_stream(); + handle.synchronize(); } private: decision_forest_variant decision_forest_; - // Cache for auto-instantiated RAFT device resource - std::unique_ptr cached_device_resources_; }; } // namespace nvforest diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4f4b256..18bef95 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,9 +62,9 @@ function(ConfigureTest) set_target_properties( ${_NVFOREST_TEST_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" - CXX_STANDARD 20 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 + CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON) set(_NVFOREST_TEST_COMPONENT_NAME testing) @@ -84,6 +84,6 @@ ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp) ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu) ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp) ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp) -ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cu) +ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libnvforest) diff --git a/cpp/tests/treelite_importer.cu 
b/cpp/tests/treelite_importer.cpp similarity index 65% rename from cpp/tests/treelite_importer.cu rename to cpp/tests/treelite_importer.cpp index 71cd18e..0eee0db 100644 --- a/cpp/tests/treelite_importer.cu +++ b/cpp/tests/treelite_importer.cpp @@ -4,14 +4,11 @@ */ #include +#include #include #include #include -#include - -#include - #include #include #include @@ -251,45 +248,45 @@ auto static const SAMPLE_FOREST = []() { TEST(TreeliteImporter, depth_first) { - auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first); - ASSERT_EQ(nvforest_model.num_features(), 7); - ASSERT_EQ(nvforest_model.num_outputs(), 1); - ASSERT_EQ(nvforest_model.num_trees(), 6); - ASSERT_FALSE(nvforest_model.has_vector_leaves()); - ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); - ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(nvforest_model.device_index(), -1); - ASSERT_FALSE(nvforest_model.is_double_precision()); + auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first); + ASSERT_EQ(fil_model.num_features(), 7); + ASSERT_EQ(fil_model.num_outputs(), 1); + ASSERT_EQ(fil_model.num_trees(), 6); + ASSERT_FALSE(fil_model.has_vector_leaves()); + ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(fil_model.device_index(), -1); + ASSERT_FALSE(fil_model.is_double_precision()); } TEST(TreeliteImporter, breadth_first) { - auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first); - ASSERT_EQ(nvforest_model.num_features(), 7); - ASSERT_EQ(nvforest_model.num_outputs(), 1); - ASSERT_EQ(nvforest_model.num_trees(), 6); - ASSERT_FALSE(nvforest_model.has_vector_leaves()); - ASSERT_EQ(nvforest_model.row_postprocessing(), 
row_op::disable); - ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(nvforest_model.device_index(), -1); - ASSERT_FALSE(nvforest_model.is_double_precision()); + auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first); + ASSERT_EQ(fil_model.num_features(), 7); + ASSERT_EQ(fil_model.num_outputs(), 1); + ASSERT_EQ(fil_model.num_trees(), 6); + ASSERT_FALSE(fil_model.has_vector_leaves()); + ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(fil_model.device_index(), -1); + ASSERT_FALSE(fil_model.is_double_precision()); } TEST(TreeliteImporter, layered_children_together) { - auto nvforest_model = + auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::layered_children_together); - ASSERT_EQ(nvforest_model.num_features(), 7); - ASSERT_EQ(nvforest_model.num_outputs(), 1); - ASSERT_EQ(nvforest_model.num_trees(), 6); - ASSERT_FALSE(nvforest_model.has_vector_leaves()); - ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); - ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); - ASSERT_EQ(nvforest_model.device_index(), -1); - ASSERT_FALSE(nvforest_model.is_double_precision()); + ASSERT_EQ(fil_model.num_features(), 7); + ASSERT_EQ(fil_model.num_outputs(), 1); + ASSERT_EQ(fil_model.num_trees(), 6); + ASSERT_FALSE(fil_model.has_vector_leaves()); + ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable); + ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable); + ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(fil_model.device_index(), -1); + ASSERT_FALSE(fil_model.is_double_precision()); } template @@ -337,90 +334,47 @@ auto 
make_degenerate_tree(const leaf_t& leaf) TEST(TreeliteImporter, DegenerateTree) { - auto tl_model = make_degenerate_tree(1.0); - auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); - ASSERT_FALSE(nvforest_model.has_vector_leaves()); + auto tl_model = make_degenerate_tree(1.0); + auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); + ASSERT_FALSE(fil_model.has_vector_leaves()); - auto resource = raft::device_resources{}; +#ifdef NVFOREST_ENABLE_GPU + auto raft_handle = raft::handle_t{}; + auto handle = raft_proto::handle_t{raft_handle}; +#else + auto handle = raft_proto::handle_t{}; +#endif auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); auto expected_preds = std::vector{1.0}; - nvforest_model.predict(resource, - preds.data(), - X.data(), - 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1); + fil_model.predict(handle, + preds.data(), + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1); ASSERT_EQ(preds, expected_preds); } -TEST(TreeliteImporter, RejectNullPointers) -{ - auto tl_model = make_degenerate_tree(1.0); - auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); - - auto resource = raft::device_resources{}; - auto X = std::vector{0.0}; - auto preds = std::vector(1, 0.0); - auto* null_pointer = static_cast(nullptr); - - EXPECT_THROW(nvforest_model.predict(resource, - null_pointer, - X.data(), - 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1), - nvforest::runtime_error); - EXPECT_THROW(nvforest_model.predict(resource, - preds.data(), - null_pointer, - 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1), - nvforest::runtime_error); - EXPECT_THROW(nvforest_model.predict(null_pointer, - X.data(), - 1, - 
raft_proto::device_type::cpu, - raft_proto::device_type::cpu, - nvforest::infer_kind::default_kind, - 1), - nvforest::runtime_error); -} - TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) { - auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); - auto nvforest_model = import_from_treelite_model(*tl_model, - tree_layout::breadth_first, - index_type{}, - std::nullopt, - raft_proto::device_type::gpu); - ASSERT_TRUE(nvforest_model.has_vector_leaves()); + auto tl_model = make_degenerate_tree(std::vector{0.5, 0.5}); + auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first); + ASSERT_TRUE(fil_model.has_vector_leaves()); - auto X = thrust::device_vector{0.0}; - auto preds = thrust::device_vector(2, 0.0); - auto h_preds = std::vector(2); + auto X = std::vector{0.0}; + auto preds = std::vector(2, 0.0); auto expected_preds = std::vector{0.5, 0.5}; - for (int i = 0; i < 3; ++i) { - // Repeatedly exercise the auto-resource overload. This interface - // synchronizes the stream before returning. - nvforest_model.predict(thrust::raw_pointer_cast(preds.data()), - thrust::raw_pointer_cast(X.data()), - 1, - raft_proto::device_type::gpu, - raft_proto::device_type::gpu, - nvforest::infer_kind::default_kind, - 1); - thrust::copy(preds.begin(), preds.end(), h_preds.begin()); - ASSERT_EQ(h_preds, expected_preds); - } + fil_model.predict(preds.data(), + X.data(), + 1, + raft_proto::device_type::cpu, + raft_proto::device_type::cpu, + nvforest::infer_kind::default_kind, + 1); + ASSERT_EQ(preds, expected_preds); } } // namespace nvforest diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 46b1b27..8500861 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -195,35 +195,16 @@ Now that the tree model is fully imported into nvForest, let's run inference: .. 
code-block:: cpp + #include + #include + + raft::handle_t raft_handle{}; + raft_proto::handle_t handle{raft_handle}; + // Assumption: // * Both output and input are in the GPU memory. // * The input buffer should be of dimension (num_rows, num_features) // * The output buffer should be of dimension (num_rows, fm.num_outputs()) - fm.predict(output, input, num_rows, + fm.predict(handle, output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); - -The overload shown above auto-instantiates and caches a ``raft::device_resources`` -object and synchronizes before returning. This is the recommended path unless -your application needs to control CUDA stream or stream-pool usage directly. For -that advanced use case, pass an explicit ``raft::device_resources`` object: - -.. code-block:: cpp - - #include - - raft::device_resources resource{}; - fm.predict(resource, output, input, num_rows, - raft_proto::device_type::gpu, raft_proto::device_type::gpu, - nvforest::infer_kind::default_kind); - -.. note:: - - In version 26.06, the C++ prediction API changed from ``raft_proto::handle_t`` - to ``raft::device_resources``. Code that previously constructed a - ``raft_proto::handle_t`` should use the no-resource overload shown above, or - if the previous call site relied on explicit CUDA stream or stream-pool - control, replace calls of the form - ``fm.predict(handle, output, input, num_rows, ...)`` with - ``fm.predict(resource, output, input, num_rows, ...)`` using - ``raft::device_resources``. 
diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py index 4fc3ff2..4fc9b15 100644 --- a/python/nvforest/nvforest/__init__.py +++ b/python/nvforest/nvforest/__init__.py @@ -24,13 +24,12 @@ GPUForestInferenceClassifier, GPUForestInferenceRegressor, ) -from nvforest._handle import DeviceResources +from nvforest._handle import Handle from nvforest._version import __git_commit__, __version__ __all__ = [ "CPUForestInferenceClassifier", "CPUForestInferenceRegressor", - "DeviceResources", "GPUForestInferenceClassifier", "GPUForestInferenceRegressor", "Handle", @@ -40,17 +39,3 @@ "__git_commit__", "__version__", ] - - -def __getattr__(name): - if name == "Handle": - import warnings - - warnings.warn( - "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 " - "and will be removed in 26.08.", - FutureWarning, - stacklevel=2, - ) - return DeviceResources - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py index 4943b91..cc0700a 100644 --- a/python/nvforest/nvforest/_factory.py +++ b/python/nvforest/nvforest/_factory.py @@ -17,7 +17,7 @@ infer_device, infer_is_classifier, ) -from nvforest._handle import DeviceResources +from nvforest._handle import Handle def get_forest_inference_class(device, is_classifier) -> type: @@ -39,7 +39,7 @@ def make_forest_inference_object( treelite_model: treelite.Model, device: str, device_id: Optional[int], - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -72,7 +72,7 @@ def load_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a model into nvForest from a serialized model file. 
@@ -113,10 +113,10 @@ def load_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.DeviceResources or None - For GPU execution, the nvForest device resources containing the stream - or stream pool to use during loading and inference. If not given, a new - device resources object will be constructed. + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream + pool to use during loading and inference. If not given, a new + handle will be constructed. """ model_path = pathlib.Path(model_file) if not model_path.exists(): @@ -174,7 +174,7 @@ def load_from_sklearn( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a Scikit-Learn forest model to nvForest @@ -208,10 +208,10 @@ def load_from_sklearn( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.DeviceResources or None - For GPU execution, the nvForest device resources containing the stream - or stream pool to use during loading and inference. If not given, a new - device resources object will be constructed. + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream + pool to use during loading and inference. If not given, a new + handle will be constructed. 
""" tl_model = treelite.sklearn.import_model(skl_model) @@ -236,7 +236,7 @@ def load_from_treelite_model( align_bytes: Optional[int] = None, precision: Optional[str] = None, device_id: Optional[int] = None, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, ) -> ForestInference: """Load a Treelite forest model to nvForest @@ -270,10 +270,10 @@ def load_from_treelite_model( device_id : int or None, default=None For GPU execution, the device on which to load and execute this model. For CPU execution, this value is currently ignored. - handle : nvforest.DeviceResources or None - For GPU execution, the nvForest device resources containing the stream - or stream pool to use during loading and inference. If not given, a new - device resources object will be constructed. + handle : nvforest.Handle or None + For GPU execution, the nvForest handle containing the stream or stream + pool to use during loading and inference. If not given, a new + handle will be constructed. 
""" return make_forest_inference_object( treelite_model=tl_model, diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py index a7602ae..2c3c933 100644 --- a/python/nvforest/nvforest/_forest_inference.py +++ b/python/nvforest/nvforest/_forest_inference.py @@ -18,7 +18,7 @@ from cuda.bindings import runtime from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor -from nvforest._handle import DeviceResources +from nvforest._handle import Handle from nvforest._typing import DataType from nvforest.detail.forest_inference import ForestInferenceImpl @@ -123,7 +123,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -310,7 +310,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -334,7 +334,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -425,7 +425,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -449,7 +449,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -532,7 +532,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", 
default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -557,7 +557,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], @@ -649,7 +649,7 @@ def __init__( self, *, treelite_model: treelite.Model, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -674,7 +674,7 @@ def _create_with_layout( cls, *, treelite_model_bytes: bytes, - handle: Optional[DeviceResources], + handle: Optional[Handle], layout: str, default_chunk_size: Optional[int], align_bytes: Optional[int], diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py index 878c48a..347b748 100644 --- a/python/nvforest/nvforest/_handle.py +++ b/python/nvforest/nvforest/_handle.py @@ -3,22 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # -import warnings +from pylibraft.common.handle import Handle as RaftHandle -from pylibraft.common.handle import DeviceResources as RaftDeviceResources - -DeviceResources = RaftDeviceResources - -__all__ = ["DeviceResources", "Handle"] - - -def __getattr__(name): - if name == "Handle": - warnings.warn( - "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 " - "and will be removed in 26.08.", - FutureWarning, - stacklevel=2, - ) - return DeviceResources - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.Handle +Handle = RaftHandle diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index 3a513b6..0340c59 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -8,14 +8,13 @@ from typing import 
Optional, Union import numpy as np import treelite -from nvforest._handle import DeviceResources +from nvforest._handle import Handle from nvforest._typing import DataType from nvforest.detail.treelite import safe_treelite_call -from cython.operator cimport dereference as deref from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from pylibraft.common.handle cimport handle_t as raft_handle_t from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op @@ -25,6 +24,7 @@ from nvforest.detail.raft_proto.cuda_stream cimport ( from nvforest.detail.raft_proto.device_type cimport ( device_type as raft_proto_device_t, ) +from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t from nvforest.detail.raft_proto.optional cimport nullopt, optional from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout from nvforest.detail.treelite cimport ( @@ -34,18 +34,10 @@ from nvforest.detail.treelite cimport ( ) -cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil: - cdef cppclass device_resources: - device_resources() except + - cuda_stream_view get_next_usable_stream() except + - void sync_stream() except + - void sync_stream_pool() except + - - cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( - const device_resources&, + const raft_proto_handle_t&, io_t*, io_t*, size_t, @@ -77,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: cdef class ForestInference_impl(): cdef forest_model model - cdef object py_handle - cdef device_resources* c_handle + cdef raft_proto_handle_t raft_proto_handle + cdef object raft_handle cdef object device def __cinit__( self, - handle: object, + raft_handle: object, tl_model_bytes: Union[bytes, bytearray], *, layout: str = "depth_first", @@ -92,8 +84,12 @@ 
cdef class ForestInference_impl(): device: str = "cpu", device_id: Optional[int] = None, ): - self.py_handle = handle - self.c_handle = self.py_handle.getHandle() + # Store reference to RAFT handle to control lifetime, since raft_proto + # handle keeps a pointer to it + self.raft_handle = raft_handle + self.raft_proto_handle = raft_proto_handle_t( + self.raft_handle.getHandle() + ) cdef optional[bool] use_double_precision_c cdef bool use_double_precision_bool @@ -138,7 +134,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.c_handle.get_next_usable_stream().value() + self.raft_proto_handle.get_next_usable_stream() ) safe_treelite_call( @@ -248,7 +244,7 @@ cdef class ForestInference_impl(): if model_dtype == np.float32: self.model.predict[float]( - deref(self.c_handle), + self.raft_proto_handle, out_ptr, in_ptr, n_rows, @@ -259,7 +255,7 @@ cdef class ForestInference_impl(): ) else: self.model.predict[double]( - deref(self.c_handle), + self.raft_proto_handle, out_ptr, in_ptr, n_rows, @@ -270,8 +266,7 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.c_handle.sync_stream_pool() - self.c_handle.sync_stream() + self.raft_proto_handle.synchronize() return preds @@ -282,7 +277,7 @@ class ForestInferenceImpl: treelite_model: treelite.Model, device: str, device_id: int, - handle: Optional[DeviceResources] = None, + handle: Optional[Handle] = None, layout: str = "depth_first", default_chunk_size: Optional[int] = None, align_bytes: Optional[int] = None, @@ -290,7 +285,7 @@ class ForestInferenceImpl: ): # Assumption: The caller needs to pass in correct (device, device_id) pair # This function will not contain any logic for auto-detecting device. 
- self.handle = DeviceResources() if handle is None else handle + self.handle = Handle() if handle is None else handle self._layout = layout self.precision = precision self.default_chunk_size = default_chunk_size diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/raft_proto/handle.pxd new file mode 100644 index 0000000..5ad107e --- /dev/null +++ b/python/nvforest/nvforest/detail/raft_proto/handle.pxd @@ -0,0 +1,19 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +from pylibraft.common.handle cimport handle_t as raft_handle_t + +from nvforest.detail.raft_proto.cuda_stream cimport ( + cuda_stream as raft_proto_stream_t, +) + + +cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil: + cdef cppclass handle_t: + handle_t() except + + handle_t(const raft_handle_t* handle_ptr) except + + handle_t(const raft_handle_t& handle) except + + raft_proto_stream_t get_next_usable_stream() except + + void synchronize() except+ From 0b2ac1a8793b37693518a5cd921262541a54e700 Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Tue, 19 May 2026 16:15:29 +0000 Subject: [PATCH 34/35] Document auto-instantiated inference path --- cpp/include/nvforest/README.md | 33 +++++++++++++++++++++++++-------- docs/source/getting_started.rst | 17 +++++++++++++---- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md index 3e3df17..a1f83e2 100644 --- a/cpp/include/nvforest/README.md +++ b/cpp/include/nvforest/README.md @@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float)); // Assuming that input is a float* pointing to data already located on-device -auto handle = raft_proto::handle_t{}; - nvforest_model.predict( - handle, output, input, num_rows, @@ -119,11 +116,31 @@ nvforest_model.predict( ); ``` -**handle**: To provide a unified interface on 
CPU and GPU, we introduce -`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a -placeholder in CPU-only builds, and using it does not require any CUDA -functionality. For GPU-enabled builds, you can construct a -`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use. +This is the primary C++ inference path. nvForest creates the RAFT handle it +needs internally and synchronizes before returning. + +Applications that already manage RAFT handles can pass one explicitly: + +```cpp +auto raft_handle = raft::handle_t{}; +auto handle = raft_proto::handle_t{raft_handle}; + +nvforest_model.predict( + handle, + output, + input, + num_rows, + raft_proto::device_type::gpu, + raft_proto::device_type::gpu, + 4 +); +``` + +**handle**: The explicit-handle overload accepts `raft_proto::handle_t`, a +wrapper for `raft::handle_t`. This is currently just a placeholder in CPU-only +builds, and using it does not require any CUDA functionality. For GPU-enabled +builds, construct a `raft_proto::handle_t` directly from the `raft::handle_t` +you wish to use. **output**: Pointer to pre-allocated buffer where results should be written. If the model has been loaded at single precision, this should be a diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 8500861..bbe2710 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -193,6 +193,19 @@ Once the tree model is available as a Treelite object, pass it to the Now that the tree model is fully imported into nvForest, let's run inference: +.. code-block:: cpp + + // Assumption: + // * Both output and input are in the GPU memory. 
+ // * The input buffer should be of dimension (num_rows, num_features) + // * The output buffer should be of dimension (num_rows, fm.num_outputs()) + fm.predict(output, input, num_rows, + raft_proto::device_type::gpu, raft_proto::device_type::gpu, + nvforest::infer_kind::default_kind); + +Applications that want more control over handle ownership, stream reuse, or +synchronization can pass a RAFT handle explicitly like this: + .. code-block:: cpp #include @@ -201,10 +214,6 @@ Now that the tree model is fully imported into nvForest, let's run inference: raft::handle_t raft_handle{}; raft_proto::handle_t handle{raft_handle}; - // Assumption: - // * Both output and input are in the GPU memory. - // * The input buffer should be of dimension (num_rows, num_features) - // * The output buffer should be of dimension (num_rows, fm.num_outputs()) fm.predict(handle, output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind); From b2f1d32e0311f1c85e50a33bf22f4451132a2516 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 20 May 2026 19:58:46 -0700 Subject: [PATCH 35/35] Add a full docstring for the new predict() --- cpp/include/nvforest/forest_model.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 12c4738..e4256c1 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -311,6 +311,27 @@ struct forest_model { /** * Perform inference on given input using an internally managed RAFT handle. * This function is blocking and synchronizes the handle before returning. 
+ * + * @param[out] output Pointer to the memory location where output should end + * up + * @param[in] input Pointer to the input data + * @param[in] num_rows Number of rows in input + * @param[in] out_mem_type The memory type (device/host) of the output + * buffer + * @param[in] in_mem_type The memory type (device/host) of the input buffer + * @param[in] predict_type Type of inference to perform. Defaults to summing + * the outputs of all trees and producing an output per row. If set to + * "per_tree", we will instead output all outputs of individual trees. + * If set to "leaf_id", we will output the integer ID of the leaf node + * for each tree. + * @param[in] specified_chunk_size Specifies the mini-batch size for + * processing. This has different meanings on CPU and GPU, but on GPU it + * corresponds to the number of rows evaluated per inference iteration + * on a single block. It can take on any power of 2 from 1 to 32, and + * runtime performance is quite sensitive to the value chosen. In general, + * larger batches benefit from higher values, but it is hard to predict the + * optimal value a priori. If omitted, a heuristic will be used to select a + * reasonable value. On CPU, this argument can generally just be omitted. */ template void predict(io_t* output,