From 35287df0f3ba4cf58a277f33feb2b5cab1d834e4 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Wed, 15 Apr 2026 19:54:50 -0700
Subject: [PATCH 01/35] Create nvforest::handle_t

---
 cpp/include/nvforest/forest_model.hpp | 14 +++----
 cpp/include/nvforest/handle.hpp       | 60 +++++++++++++++++++++++++++
 cpp/tests/treelite_importer.cpp       |  5 ++-
 3 files changed, 70 insertions(+), 9 deletions(-)
 create mode 100644 cpp/include/nvforest/handle.hpp
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 2b1e6e5..eecda34 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -7,7 +7,7 @@
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/detail/raft_proto/handle.hpp>
+#include <nvforest/handle.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #include <cstddef>
@@ -152,8 +152,8 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
-   * on GPU) which will be used to provide streams for evaluation.
+   * @param[in] handle The nvforest::handle_t which will be used to provide
+   * streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
    * work will be distributed across available streams to copy the data back
@@ -177,7 +177,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft_proto::handle_t const& handle,
+  void predict(handle_t const& handle,
                raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
@@ -252,8 +252,8 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
-   * on GPU) which will be used to provide streams for evaluation.
+   * @param[in] handle The nvforest::handle_t which will be used to provide
+   * streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
    * @param[in] input Pointer to the input data
@@ -276,7 +276,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft_proto::handle_t const& handle,
+  void predict(handle_t const& handle,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
diff --git a/cpp/include/nvforest/handle.hpp b/cpp/include/nvforest/handle.hpp
new file mode 100644
index 0000000..5ea4f3b
--- /dev/null
+++ b/cpp/include/nvforest/handle.hpp
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/raft_proto/handle.hpp>
+
+#ifdef NVFOREST_ENABLE_GPU
+#include <raft/core/handle.hpp>
+
+#include <memory>
+#endif
+
+namespace nvforest {
+
+#ifdef NVFOREST_ENABLE_GPU
+/**
+ * A thin wrapper around raft_proto::handle_t that owns the underlying raft::handle_t.
+ *
+ * Default construction automatically creates both a raft::handle_t and the
+ * raft_proto::handle_t that references it, so callers do not need to manage
+ * RAFT handles directly.
+ */
+struct handle_t {
+  /** Default constructor: creates and owns a raft::handle_t and wraps it */
+  handle_t()
+    : owned_raft_handle_{std::make_unique<raft::handle_t>()},
+      raft_proto_handle_{*owned_raft_handle_}
+  {
+  }
+
+  /** Wrap an externally-owned raft::handle_t without taking ownership */
+  handle_t(raft::handle_t const& raft_handle) : raft_proto_handle_{raft_handle} {}
+
+  auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); }
+  auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); }
+  auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); }
+  void synchronize() const { raft_proto_handle_.synchronize(); }
+
+ private:
+  // Null when wrapping an external raft::handle_t
+  std::unique_ptr<raft::handle_t> owned_raft_handle_;
+  raft_proto::handle_t raft_proto_handle_;
+};
+#else
+/**
+ * CPU-only handle: thin wrapper around the no-op raft_proto::handle_t.
+ */
+struct handle_t {
+  auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); }
+  auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); }
+  auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); }
+  void synchronize() const { raft_proto_handle_.synchronize(); }
+
+ private:
+  raft_proto::handle_t raft_proto_handle_;
+};
+#endif
+
+}  // namespace nvforest
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index 8f946e4..fcd8ad2 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -4,6 +4,7 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/handle.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
@@ -337,7 +338,7 @@ TEST(TreeliteImporter, DegenerateTree)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_FALSE(fil_model.has_vector_leaves());
 
-  auto handle         = raft::handle_t{};
+  auto handle         = nvforest::handle_t{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};
@@ -358,7 +359,7 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_TRUE(fil_model.has_vector_leaves());
 
-  auto handle         = raft::handle_t{};
+  auto handle         = nvforest::handle_t{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};

From 551bbd23da2f24b7b7097bd3a0bbbe5580ac8823 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Wed, 15 Apr 2026 20:00:56 -0700
Subject: [PATCH 02/35] Update doc

---
 docs/source/getting_started.rst | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 8500861..2522f7d 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -195,11 +195,9 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 
 .. code-block:: cpp
 
-    #include <raft/core/handle.hpp>
-    #include <nvforest/detail/raft_proto/handle.hpp>
+    #include <nvforest/handle.hpp>
 
-    raft::handle_t raft_handle{};
-    raft_proto::handle_t handle{raft_handle};
+    auto handle = nvforest::handle_t{};
 
     // Assumption:
     // * Both output and input are in the GPU memory.

From d8ad61bf2339e78196f00cb168244c52313d6967 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Wed, 15 Apr 2026 20:45:27 -0700
Subject: [PATCH 03/35] Define nvForest handle in Python layer

---
 python/nvforest/CMakeLists.txt                |  6 ++++
 python/nvforest/nvforest/_handle.py           |  5 ++--
 .../nvforest/detail/forest_inference.pyx      | 28 ++++++++-----------
 .../detail/{raft_proto => }/handle.pxd        |  9 +++---
 python/nvforest/nvforest/detail/handle.pyx    | 17 +++++++++++
 5 files changed, 42 insertions(+), 23 deletions(-)
 rename python/nvforest/nvforest/detail/{raft_proto => }/handle.pxd (58%)
 create mode 100644 python/nvforest/nvforest/detail/handle.pyx

diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt
index fd548e5..dfe234d 100644
--- a/python/nvforest/CMakeLists.txt
+++ b/python/nvforest/CMakeLists.txt
@@ -72,6 +72,12 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}"
   INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
 
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "nvforest/detail/handle.pyx"
+  LINKED_LIBRARIES "${linked_libraries}"
+  INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
+
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "nvforest/detail/treelite.pyx"
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
index 347b748..8b68338 100644
--- a/python/nvforest/nvforest/_handle.py
+++ b/python/nvforest/nvforest/_handle.py
@@ -3,7 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-from pylibraft.common.handle import Handle as RaftHandle
+from nvforest.detail.handle import Handle as _Handle
 
-# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.Handle
-Handle = RaftHandle
+Handle = _Handle
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 0340c59..dec4e22 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -12,10 +12,11 @@ from nvforest._handle import Handle
 from nvforest._typing import DataType
 from nvforest.detail.treelite import safe_treelite_call
 
+from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
-from pylibraft.common.handle cimport handle_t as raft_handle_t
 
+from nvforest.detail.handle cimport handle_t
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
 from nvforest.detail.raft_proto.cuda_stream cimport (
@@ -24,7 +25,6 @@ from nvforest.detail.raft_proto.cuda_stream cimport (
 from nvforest.detail.raft_proto.device_type cimport (
     device_type as raft_proto_device_t,
 )
-from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t
 from nvforest.detail.raft_proto.optional cimport nullopt, optional
 from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout
 from nvforest.detail.treelite cimport (
@@ -37,7 +37,7 @@ from nvforest.detail.treelite cimport (
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
-            const raft_proto_handle_t&,
+            const handle_t&,
             io_t*,
             io_t*,
             size_t,
@@ -69,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
 
 cdef class ForestInference_impl():
     cdef forest_model model
-    cdef raft_proto_handle_t raft_proto_handle
-    cdef object raft_handle
+    cdef object py_handle
+    cdef handle_t* c_handle
     cdef object device
 
     def __cinit__(
         self,
-        raft_handle: object,
+        handle: object,
         tl_model_bytes: Union[bytes, bytearray],
         *,
         layout: str = "depth_first",
@@ -84,12 +84,8 @@ cdef class ForestInference_impl():
         device: str = "cpu",
         device_id: Optional[int] = None,
     ):
-        # Store reference to RAFT handle to control lifetime, since raft_proto
-        # handle keeps a pointer to it
-        self.raft_handle = raft_handle
-        self.raft_proto_handle = raft_proto_handle_t(
-            <raft_handle_t*><size_t>self.raft_handle.getHandle()
-        )
+        self.py_handle = handle
+        self.c_handle = <handle_t*><size_t>self.py_handle.getHandle()
 
         cdef optional[bool] use_double_precision_c
         cdef bool use_double_precision_bool
@@ -134,7 +130,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            self.raft_proto_handle.get_next_usable_stream()
+            self.c_handle.get_next_usable_stream()
         )
 
         safe_treelite_call(
@@ -244,7 +240,7 @@ cdef class ForestInference_impl():
 
         if model_dtype == np.float32:
             self.model.predict[float](
-                self.raft_proto_handle,
+                deref(self.c_handle),
                 <float *> out_ptr,
                 <float *> in_ptr,
                 n_rows,
@@ -255,7 +251,7 @@ cdef class ForestInference_impl():
             )
         else:
             self.model.predict[double](
-                self.raft_proto_handle,
+                deref(self.c_handle),
                 <double *> out_ptr,
                 <double *> in_ptr,
                 n_rows,
@@ -266,7 +262,7 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.raft_proto_handle.synchronize()
+            self.c_handle.synchronize()
         return preds
 
 
diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd
similarity index 58%
rename from python/nvforest/nvforest/detail/raft_proto/handle.pxd
rename to python/nvforest/nvforest/detail/handle.pxd
index 5ad107e..033caf3 100644
--- a/python/nvforest/nvforest/detail/raft_proto/handle.pxd
+++ b/python/nvforest/nvforest/detail/handle.pxd
@@ -3,17 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-from pylibraft.common.handle cimport handle_t as raft_handle_t
+from libcpp.memory cimport unique_ptr
 
 from nvforest.detail.raft_proto.cuda_stream cimport (
     cuda_stream as raft_proto_stream_t,
 )
 
 
-cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil:
+cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil:
     cdef cppclass handle_t:
         handle_t() except +
-        handle_t(const raft_handle_t* handle_ptr) except +
-        handle_t(const raft_handle_t& handle) except +
         raft_proto_stream_t get_next_usable_stream() except +
         void synchronize() except+
+
+cdef class Handle:
+    cdef unique_ptr[handle_t] c_obj
diff --git a/python/nvforest/nvforest/detail/handle.pyx b/python/nvforest/nvforest/detail/handle.pyx
new file mode 100644
index 0000000..e4e9ae1
--- /dev/null
+++ b/python/nvforest/nvforest/detail/handle.pyx
@@ -0,0 +1,17 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+cdef class Handle:
+    def __cinit__(self, c_obj=None):
+        self.c_obj.reset(new handle_t())
+
+    def __getstate__(self):
+        return object()
+
+    def __setstate__(self, state):
+        self.c_obj.reset(new handle_t())
+
+    def getHandle(self):
+        return <size_t> self.c_obj.get()

From f042cc028868e4e79b925cbfe6adb8609c59bac3 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Wed, 15 Apr 2026 21:01:01 -0700
Subject: [PATCH 04/35] Remove pylibraft dependency

---
 ci/release/update-version.sh                  |  1 -
 .../all_cuda-129_arch-aarch64.yaml            |  1 -
 .../all_cuda-129_arch-x86_64.yaml             |  1 -
 .../all_cuda-131_arch-aarch64.yaml            |  1 -
 .../all_cuda-131_arch-x86_64.yaml             |  1 -
 conda/recipes/nvforest/recipe.yaml            |  2 --
 dependencies.yaml                             | 29 -------------------
 python/agents.md                              |  2 +-
 python/nvforest/pyproject.toml                |  2 --
 9 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 6e177e8..35e3775 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -112,7 +112,6 @@ DEPENDENCIES=(
   libraft
   libraft-headers
   librmm
-  pylibraft
   rapids-xgboost
   rmm
 )
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 1e40579..03854c5 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -29,7 +29,6 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index b9b9608..b9dab65 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -29,7 +29,6 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 1a7a0d8..062480b 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -29,7 +29,6 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 52489d8..4cefcc8 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -29,7 +29,6 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml
index fe70082..90b24e4 100644
--- a/conda/recipes/nvforest/recipe.yaml
+++ b/conda/recipes/nvforest/recipe.yaml
@@ -80,7 +80,6 @@ requirements:
     - cython >=3.0.0
     - libnvforest =${{ version }}
     - pip
-    - pylibraft =${{ minor_version }}
     - python =${{ py_abi_min }}
     - python-abi3 ${{ py_abi_min }}.*
     - rapids-build-backend >=0.4.0,<0.5.0.dev0
@@ -96,7 +95,6 @@ requirements:
     - libnvforest =${{ version }}
     - numpy >=1.23,<3.0a0
     - scikit-learn >=1.4
-    - pylibraft =${{ minor_version }}
     - python
     - treelite ${{ treelite_version }}
     - cuda-cudart
diff --git a/dependencies.yaml b/dependencies.yaml
index ae76ff9..f6b5f86 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -16,7 +16,6 @@ files:
       - depends_on_cupy
       - depends_on_libraft
       - depends_on_librmm
-      - depends_on_pylibraft
       - depends_on_rapids_logger
       - develop
       - docs
@@ -37,7 +36,6 @@ files:
       - depends_on_cupy
       - depends_on_libraft
       - depends_on_librmm
-      - depends_on_pylibraft
       - depends_on_rapids_logger
       - py_build_nvforest
       - py_run_nvforest
@@ -115,7 +113,6 @@ files:
       - depends_on_libnvforest
       - depends_on_libraft
       - depends_on_librmm
-      - depends_on_pylibraft
       - py_build_nvforest
   py_run_nvforest:
     output: pyproject
@@ -126,7 +123,6 @@ files:
       - depends_on_cuda_python
       - depends_on_cupy
       - depends_on_libnvforest
-      - depends_on_pylibraft
       - py_run_nvforest
   py_test_nvforest:
     output: pyproject
@@ -510,31 +506,6 @@ dependencies:
             packages:
               - librmm-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
-  depends_on_pylibraft:
-    common:
-      - output_types: conda
-        packages:
-          - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - pylibraft-cu12==26.6.*,>=0.0.0a0
-          - matrix:
-              cuda: "13.*"
-              cuda_suffixed: "true"
-            packages:
-              - pylibraft-cu13==26.6.*,>=0.0.0a0
-          - matrix:
-            packages:
-              - *pylibraft_unsuffixed
   depends_on_rapids_logger:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/agents.md b/python/agents.md
index 093ce8a..3e1bd9f 100644
--- a/python/agents.md
+++ b/python/agents.md
@@ -249,7 +249,7 @@ model = treelite.Model.load(filepath, format='xgboost_json')
 **Lightweight Design Philosophy**:
 nvForest must remain a lean, focused inference library. When reviewing changes that add dependencies:
 - **Question every new dependency**: Is it absolutely necessary? Can we achieve the same with existing deps?
-- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm, pylibraft)
+- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm)
 - **Avoid**: Large ML frameworks, libraries with heavy transitive dependencies, optional "nice-to-have" deps
 - **Install size matters**: New deps should not significantly increase wheel/conda package size
 - **Runtime dependencies are costly**: Each new import adds startup time and potential version conflicts
diff --git a/python/nvforest/pyproject.toml b/python/nvforest/pyproject.toml
index 5fae739..b88eac0 100644
--- a/python/nvforest/pyproject.toml
+++ b/python/nvforest/pyproject.toml
@@ -31,7 +31,6 @@ dependencies = [
     "cupy-cuda13x>=13.6.0",
     "libnvforest==26.6.*,>=0.0.0a0",
     "numpy>=1.23,<3.0a0",
-    "pylibraft==26.6.*,>=0.0.0a0",
     "scikit-learn>=1.5",
     "treelite>=4.6.1,<5.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -97,7 +96,6 @@ requires = [
     "libraft==26.6.*,>=0.0.0a0",
     "librmm==26.6.*,>=0.0.0a0",
     "ninja",
-    "pylibraft==26.6.*,>=0.0.0a0",
     "treelite>=4.6.1,<5.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From 97a897661e1b3d0fd78edd69c7fd7db2859515da Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 11:19:34 -0700
Subject: [PATCH 05/35] Add libraft-headers to Conda recipe

---
 conda/recipes/nvforest/recipe.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml
index 90b24e4..a1c58cf 100644
--- a/conda/recipes/nvforest/recipe.yaml
+++ b/conda/recipes/nvforest/recipe.yaml
@@ -80,6 +80,7 @@ requirements:
     - cython >=3.0.0
     - libnvforest =${{ version }}
     - pip
+    - libraft-headers =${{ minor_version }}
     - python =${{ py_abi_min }}
     - python-abi3 ${{ py_abi_min }}.*
     - rapids-build-backend >=0.4.0,<0.5.0.dev0

From ab44b9223ef510f6146448612cb68a00d0778fcd Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 14:22:56 -0700
Subject: [PATCH 06/35] Remove unused parameter

---
 python/nvforest/nvforest/detail/handle.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/nvforest/nvforest/detail/handle.pyx b/python/nvforest/nvforest/detail/handle.pyx
index e4e9ae1..9f517ab 100644
--- a/python/nvforest/nvforest/detail/handle.pyx
+++ b/python/nvforest/nvforest/detail/handle.pyx
@@ -4,7 +4,7 @@
 #
 
 cdef class Handle:
-    def __cinit__(self, c_obj=None):
+    def __cinit__(self):
         self.c_obj.reset(new handle_t())
 
     def __getstate__(self):

From b781fda030aef2e07039ebb5495bd4aa181dcc43 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 16:51:51 -0700
Subject: [PATCH 07/35] Revert "Add libraft-headers to Conda recipe"

This reverts commit 97a897661e1b3d0fd78edd69c7fd7db2859515da.
---
 conda/recipes/nvforest/recipe.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml
index a1c58cf..90b24e4 100644
--- a/conda/recipes/nvforest/recipe.yaml
+++ b/conda/recipes/nvforest/recipe.yaml
@@ -80,7 +80,6 @@ requirements:
     - cython >=3.0.0
     - libnvforest =${{ version }}
     - pip
-    - libraft-headers =${{ minor_version }}
     - python =${{ py_abi_min }}
     - python-abi3 ${{ py_abi_min }}.*
     - rapids-build-backend >=0.4.0,<0.5.0.dev0

From 9000a73d1294512b301db931473b48f0842e6f3d Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 17:09:45 -0700
Subject: [PATCH 08/35] Revert "Remove pylibraft dependency"

This reverts commit f042cc028868e4e79b925cbfe6adb8609c59bac3.
---
 ci/release/update-version.sh                  |  1 +
 .../all_cuda-129_arch-aarch64.yaml            |  1 +
 .../all_cuda-129_arch-x86_64.yaml             |  1 +
 .../all_cuda-131_arch-aarch64.yaml            |  1 +
 .../all_cuda-131_arch-x86_64.yaml             |  1 +
 conda/recipes/nvforest/recipe.yaml            |  2 ++
 dependencies.yaml                             | 29 +++++++++++++++++++
 python/agents.md                              |  2 +-
 python/nvforest/pyproject.toml                |  2 ++
 9 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 35e3775..6e177e8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -112,6 +112,7 @@ DEPENDENCIES=(
   libraft
   libraft-headers
   librmm
+  pylibraft
   rapids-xgboost
   rmm
 )
diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 03854c5..1e40579 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -29,6 +29,7 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
+- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index b9dab65..b9b9608 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -29,6 +29,7 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
+- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml
index 062480b..1a7a0d8 100644
--- a/conda/environments/all_cuda-131_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -29,6 +29,7 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
+- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml
index 4cefcc8..52489d8 100644
--- a/conda/environments/all_cuda-131_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -29,6 +29,7 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
+- pylibraft==26.6.*,>=0.0.0a0
 - pytest
 - pytest-cov
 - pytest-xdist
diff --git a/conda/recipes/nvforest/recipe.yaml b/conda/recipes/nvforest/recipe.yaml
index 90b24e4..fe70082 100644
--- a/conda/recipes/nvforest/recipe.yaml
+++ b/conda/recipes/nvforest/recipe.yaml
@@ -80,6 +80,7 @@ requirements:
     - cython >=3.0.0
     - libnvforest =${{ version }}
     - pip
+    - pylibraft =${{ minor_version }}
     - python =${{ py_abi_min }}
     - python-abi3 ${{ py_abi_min }}.*
     - rapids-build-backend >=0.4.0,<0.5.0.dev0
@@ -95,6 +96,7 @@ requirements:
     - libnvforest =${{ version }}
     - numpy >=1.23,<3.0a0
     - scikit-learn >=1.4
+    - pylibraft =${{ minor_version }}
     - python
     - treelite ${{ treelite_version }}
     - cuda-cudart
diff --git a/dependencies.yaml b/dependencies.yaml
index f6b5f86..ae76ff9 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -16,6 +16,7 @@ files:
       - depends_on_cupy
       - depends_on_libraft
       - depends_on_librmm
+      - depends_on_pylibraft
       - depends_on_rapids_logger
       - develop
       - docs
@@ -36,6 +37,7 @@ files:
       - depends_on_cupy
       - depends_on_libraft
       - depends_on_librmm
+      - depends_on_pylibraft
       - depends_on_rapids_logger
       - py_build_nvforest
       - py_run_nvforest
@@ -113,6 +115,7 @@ files:
       - depends_on_libnvforest
       - depends_on_libraft
       - depends_on_librmm
+      - depends_on_pylibraft
       - py_build_nvforest
   py_run_nvforest:
     output: pyproject
@@ -123,6 +126,7 @@ files:
       - depends_on_cuda_python
       - depends_on_cupy
       - depends_on_libnvforest
+      - depends_on_pylibraft
       - py_run_nvforest
   py_test_nvforest:
     output: pyproject
@@ -506,6 +510,31 @@ dependencies:
             packages:
               - librmm-cu13==26.6.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
+  depends_on_pylibraft:
+    common:
+      - output_types: conda
+        packages:
+          - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibraft-cu12==26.6.*,>=0.0.0a0
+          - matrix:
+              cuda: "13.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibraft-cu13==26.6.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *pylibraft_unsuffixed
   depends_on_rapids_logger:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/agents.md b/python/agents.md
index 3e1bd9f..093ce8a 100644
--- a/python/agents.md
+++ b/python/agents.md
@@ -249,7 +249,7 @@ model = treelite.Model.load(filepath, format='xgboost_json')
 **Lightweight Design Philosophy**:
 nvForest must remain a lean, focused inference library. When reviewing changes that add dependencies:
 - **Question every new dependency**: Is it absolutely necessary? Can we achieve the same with existing deps?
-- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm)
+- **Allowed dependencies**: numpy, treelite, and RAPIDS core libs (rmm, pylibraft)
 - **Avoid**: Large ML frameworks, libraries with heavy transitive dependencies, optional "nice-to-have" deps
 - **Install size matters**: New deps should not significantly increase wheel/conda package size
 - **Runtime dependencies are costly**: Each new import adds startup time and potential version conflicts
diff --git a/python/nvforest/pyproject.toml b/python/nvforest/pyproject.toml
index b88eac0..5fae739 100644
--- a/python/nvforest/pyproject.toml
+++ b/python/nvforest/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "cupy-cuda13x>=13.6.0",
     "libnvforest==26.6.*,>=0.0.0a0",
     "numpy>=1.23,<3.0a0",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "scikit-learn>=1.5",
     "treelite>=4.6.1,<5.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -96,6 +97,7 @@ requires = [
     "libraft==26.6.*,>=0.0.0a0",
     "librmm==26.6.*,>=0.0.0a0",
     "ninja",
+    "pylibraft==26.6.*,>=0.0.0a0",
     "treelite>=4.6.1,<5.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From cd9bece457f113b0865eb11ec82c6c90588328ab Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 17:10:31 -0700
Subject: [PATCH 09/35] Remove custom handle types entirely

---
 .../nvforest/detail/raft_proto/handle.hpp     | 43 -------------
 cpp/include/nvforest/forest_model.hpp         | 24 ++++----
 cpp/include/nvforest/handle.hpp               | 60 -------------------
 cpp/tests/treelite_importer.cpp               | 11 ++--
 python/nvforest/CMakeLists.txt                |  6 --
 python/nvforest/nvforest/_handle.py           |  5 +-
 .../nvforest/detail/forest_inference.pyx      | 21 +++++--
 python/nvforest/nvforest/detail/handle.pxd    | 20 -------
 python/nvforest/nvforest/detail/handle.pyx    | 17 ------
 9 files changed, 37 insertions(+), 170 deletions(-)
 delete mode 100644 cpp/include/nvforest/detail/raft_proto/handle.hpp
 delete mode 100644 cpp/include/nvforest/handle.hpp
 delete mode 100644 python/nvforest/nvforest/detail/handle.pxd
 delete mode 100644 python/nvforest/nvforest/detail/handle.pyx

diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/detail/raft_proto/handle.hpp
deleted file mode 100644
index 086d61b..0000000
--- a/cpp/include/nvforest/detail/raft_proto/handle.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-
-#include <algorithm>
-#include <cstddef>
-#ifdef NVFOREST_ENABLE_GPU
-#include <raft/core/handle.hpp>
-#endif
-
-namespace raft_proto {
-#ifdef NVFOREST_ENABLE_GPU
-struct handle_t {
-  handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {}
-  handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {}
-  auto get_next_usable_stream() const
-  {
-    return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()};
-  }
-  auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); }
-  auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
-  void synchronize() const
-  {
-    raft_handle_->sync_stream_pool();
-    raft_handle_->sync_stream();
-  }
-
- private:
-  // Have to store a pointer because handle is not movable
-  raft::handle_t const* raft_handle_;
-};
-#else
-struct handle_t {
-  auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; }
-  auto get_stream_pool_size() const { return std::size_t{}; }
-  auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
-  void synchronize() const {}
-};
-#endif
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index eecda34..2cd36df 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -7,9 +7,10 @@
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/handle.hpp>
 #include <nvforest/infer_kind.hpp>
 
+#include <raft/core/device_resources.hpp>
+
 #include <cstddef>
 #include <type_traits>
 #include <variant>
@@ -152,7 +153,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The nvforest::handle_t which will be used to provide
+   * @param[in] resource RAFT resource which will be used to provide
    * streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
@@ -177,31 +178,32 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(handle_t const& handle,
+  void predict(raft::device_resources const& resource,
                raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
     std::visit(
-      [this, predict_type, &handle, &output, &input, &specified_chunk_size](
+      [this, predict_type, &resource, &output, &input, &specified_chunk_size](
         auto&& concrete_forest) {
         using model_io_t = typename std::remove_reference_t<decltype(concrete_forest)>::io_type;
         if constexpr (std::is_same_v<model_io_t, io_t>) {
           if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) {
             concrete_forest.predict(
-              output, input, handle.get_next_usable_stream(), predict_type, specified_chunk_size);
+              output, input, resource.get_next_usable_stream(), predict_type, specified_chunk_size);
           } else {
             auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64};
             auto constexpr static const MAX_CHUNK_SIZE           = std::size_t{64};
 
-            auto row_count = input.size() / num_features();
+            auto row_count           = input.size() / num_features();
+            auto usable_stream_count = std::max(resource.get_stream_pool_size(), std::size_t{1});
             auto partition_size =
-              std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()),
+              std::max(raft_proto::ceildiv(row_count, usable_stream_count),
                        specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION);
             auto partition_count = raft_proto::ceildiv(row_count, partition_size);
             for (auto i = std::size_t{}; i < partition_count; ++i) {
-              auto stream = handle.get_next_usable_stream();
+              auto stream = resource.get_next_usable_stream();
               auto rows_in_this_partition =
                 std::min(partition_size, row_count - i * partition_size);
               auto partition_in = raft_proto::buffer<io_t>{};
@@ -252,7 +254,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The nvforest::handle_t which will be used to provide
+   * @param[in] resource RAFT resource which will be used to provide
    * streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -276,7 +278,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(handle_t const& handle,
+  void predict(raft::device_resources const& resource,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
@@ -296,7 +298,7 @@ struct forest_model {
       raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id};
     auto in_buffer =
       raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id};
-    predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
+    predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
  private:
diff --git a/cpp/include/nvforest/handle.hpp b/cpp/include/nvforest/handle.hpp
deleted file mode 100644
index 5ea4f3b..0000000
--- a/cpp/include/nvforest/handle.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/handle.hpp>
-
-#ifdef NVFOREST_ENABLE_GPU
-#include <raft/core/handle.hpp>
-
-#include <memory>
-#endif
-
-namespace nvforest {
-
-#ifdef NVFOREST_ENABLE_GPU
-/**
- * A thin wrapper around raft_proto::handle_t that owns the underlying raft::handle_t.
- *
- * Default construction automatically creates both a raft::handle_t and the
- * raft_proto::handle_t that references it, so callers do not need to manage
- * RAFT handles directly.
- */
-struct handle_t {
-  /** Default constructor: creates and owns a raft::handle_t and wraps it */
-  handle_t()
-    : owned_raft_handle_{std::make_unique<raft::handle_t>()},
-      raft_proto_handle_{*owned_raft_handle_}
-  {
-  }
-
-  /** Wrap an externally-owned raft::handle_t without taking ownership */
-  handle_t(raft::handle_t const& raft_handle) : raft_proto_handle_{raft_handle} {}
-
-  auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); }
-  auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); }
-  auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); }
-  void synchronize() const { raft_proto_handle_.synchronize(); }
-
- private:
-  // Null when wrapping an external raft::handle_t
-  std::unique_ptr<raft::handle_t> owned_raft_handle_;
-  raft_proto::handle_t raft_proto_handle_;
-};
-#else
-/**
- * CPU-only handle: thin wrapper around the no-op raft_proto::handle_t.
- */
-struct handle_t {
-  auto get_next_usable_stream() const { return raft_proto_handle_.get_next_usable_stream(); }
-  auto get_stream_pool_size() const { return raft_proto_handle_.get_stream_pool_size(); }
-  auto get_usable_stream_count() const { return raft_proto_handle_.get_usable_stream_count(); }
-  void synchronize() const { raft_proto_handle_.synchronize(); }
-
- private:
-  raft_proto::handle_t raft_proto_handle_;
-};
-#endif
-
-}  // namespace nvforest
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index fcd8ad2..f2b3083 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -4,11 +4,12 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/handle.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
 
+#include <raft/core/device_resources.hpp>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <treelite/enum/task_type.h>
@@ -338,11 +339,11 @@ TEST(TreeliteImporter, DegenerateTree)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_FALSE(fil_model.has_vector_leaves());
 
-  auto handle         = nvforest::handle_t{};
+  auto resource       = raft::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};
-  fil_model.predict(handle,
+  fil_model.predict(resource,
                     preds.data(),
                     X.data(),
                     1,
@@ -359,11 +360,11 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_TRUE(fil_model.has_vector_leaves());
 
-  auto handle         = nvforest::handle_t{};
+  auto resource       = raft::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};
-  fil_model.predict(handle,
+  fil_model.predict(resource,
                     preds.data(),
                     X.data(),
                     1,
diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt
index dfe234d..fd548e5 100644
--- a/python/nvforest/CMakeLists.txt
+++ b/python/nvforest/CMakeLists.txt
@@ -72,12 +72,6 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}"
   INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
 
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "nvforest/detail/handle.pyx"
-  LINKED_LIBRARIES "${linked_libraries}"
-  INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
-
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "nvforest/detail/treelite.pyx"
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
index 8b68338..0633eee 100644
--- a/python/nvforest/nvforest/_handle.py
+++ b/python/nvforest/nvforest/_handle.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-from nvforest.detail.handle import Handle as _Handle
+from pylibraft.common.handle import DeviceResources as RaftDeviceResources
 
-Handle = _Handle
+# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources
+Handle = RaftDeviceResources
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index dec4e22..946c3fa 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -15,8 +15,8 @@ from nvforest.detail.treelite import safe_treelite_call
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
 
-from nvforest.detail.handle cimport handle_t
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
 from nvforest.detail.raft_proto.cuda_stream cimport (
@@ -34,10 +34,18 @@ from nvforest.detail.treelite cimport (
 )
 
 
+cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil:
+    cdef cppclass device_resources:
+        device_resources() except +
+        cuda_stream_view get_next_usable_stream() except +
+        void sync_stream() except +
+        void sync_stream_pool() except +
+
+
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
-            const handle_t&,
+            const device_resources&,
             io_t*,
             io_t*,
             size_t,
@@ -70,7 +78,7 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
 cdef class ForestInference_impl():
     cdef forest_model model
     cdef object py_handle
-    cdef handle_t* c_handle
+    cdef device_resources* c_handle
     cdef object device
 
     def __cinit__(
@@ -85,7 +93,7 @@ cdef class ForestInference_impl():
         device_id: Optional[int] = None,
     ):
         self.py_handle = handle
-        self.c_handle = <handle_t*><size_t>self.py_handle.getHandle()
+        self.c_handle = <device_resources*><size_t>self.py_handle.getHandle()
 
         cdef optional[bool] use_double_precision_c
         cdef bool use_double_precision_bool
@@ -130,7 +138,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            self.c_handle.get_next_usable_stream()
+            <raft_proto_stream_t> self.c_handle.get_next_usable_stream().value()
         )
 
         safe_treelite_call(
@@ -262,7 +270,8 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.c_handle.synchronize()
+            self.c_handle.sync_stream_pool()
+            self.c_handle.sync_stream()
         return preds
 
 
diff --git a/python/nvforest/nvforest/detail/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd
deleted file mode 100644
index 033caf3..0000000
--- a/python/nvforest/nvforest/detail/handle.pxd
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-from libcpp.memory cimport unique_ptr
-
-from nvforest.detail.raft_proto.cuda_stream cimport (
-    cuda_stream as raft_proto_stream_t,
-)
-
-
-cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil:
-    cdef cppclass handle_t:
-        handle_t() except +
-        raft_proto_stream_t get_next_usable_stream() except +
-        void synchronize() except+
-
-cdef class Handle:
-    cdef unique_ptr[handle_t] c_obj
diff --git a/python/nvforest/nvforest/detail/handle.pyx b/python/nvforest/nvforest/detail/handle.pyx
deleted file mode 100644
index 9f517ab..0000000
--- a/python/nvforest/nvforest/detail/handle.pyx
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-cdef class Handle:
-    def __cinit__(self):
-        self.c_obj.reset(new handle_t())
-
-    def __getstate__(self):
-        return object()
-
-    def __setstate__(self, state):
-        self.c_obj.reset(new handle_t())
-
-    def getHandle(self):
-        return <size_t> self.c_obj.get()

From fb4355affc443b0111e154d00676fb322a1f06e4 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Thu, 16 Apr 2026 17:43:32 -0700
Subject: [PATCH 10/35] Add new interface to auto-instantiate raft resource

---
 cpp/include/nvforest/forest_model.hpp | 44 +++++++++++++++++++++++++++
 cpp/tests/treelite_importer.cpp       |  4 +--
 docs/source/getting_started.rst       |  6 +---
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 2cd36df..be45584 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -301,6 +301,50 @@ struct forest_model {
     predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
+  /**
+   * Perform inference on given input (with auto-instantiated RAFT resource)
+   *
+   * @param[out] output Pointer to the memory location where output should end
+   * up
+   * @param[in] input Pointer to the input data
+   * @param[in] num_rows Number of rows in input
+   * @param[in] out_mem_type The memory type (device/host) of the output
+   * buffer
+   * @param[in] in_mem_type The memory type (device/host) of the input buffer
+   * @param[in] predict_type Type of inference to perform. Defaults to summing
+   * the outputs of all trees and producing an output per row. If set to
+   * "per_tree", we will instead output all outputs of individual trees.
+   * If set to "leaf_id", we will output the integer ID of the leaf node
+   * for each tree.
+   * @param[in] specified_chunk_size Specifies the mini-batch size for
+   * processing. This has different meanings on CPU and GPU, but on GPU it
+   * corresponds to the number of rows evaluated per inference iteration
+   * on a single block. It can take on any power of 2 from 1 to 32, and
+   * runtime performance is quite sensitive to the value chosen. In general,
+   * larger batches benefit from higher values, but it is hard to predict the
+   * optimal value a priori. If omitted, a heuristic will be used to select a
+   * reasonable value. On CPU, this argument can generally just be omitted.
+   */
+  template <typename io_t>
+  void predict(io_t* output,
+               io_t* input,
+               std::size_t num_rows,
+               raft_proto::device_type out_mem_type,
+               raft_proto::device_type in_mem_type,
+               infer_kind predict_type                        = infer_kind::default_kind,
+               std::optional<index_type> specified_chunk_size = std::nullopt)
+  {
+    auto resource = raft::device_resources{};
+    predict(resource,
+            output,
+            input,
+            num_rows,
+            out_mem_type,
+            in_mem_type,
+            predict_type,
+            specified_chunk_size);
+  }
+
  private:
   decision_forest_variant decision_forest_;
 };
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index f2b3083..f6f3413 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -360,12 +360,10 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_TRUE(fil_model.has_vector_leaves());
 
-  auto resource       = raft::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};
-  fil_model.predict(resource,
-                    preds.data(),
+  fil_model.predict(preds.data(),
                     X.data(),
                     1,
                     raft_proto::device_type::cpu,
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 2522f7d..e94b666 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -195,14 +195,10 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 
 .. code-block:: cpp
 
-    #include <nvforest/handle.hpp>
-
-    auto handle = nvforest::handle_t{};
-
     // Assumption:
     // * Both output and input are in the GPU memory.
     // * The input buffer should be of dimension (num_rows, num_features)
     // * The output buffer should be of dimension (num_rows, fm.num_outputs())
-    fm.predict(handle, output, input, num_rows,
+    fm.predict(output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);

From f4c6fee4c6ebd1bd7d4bcb289b41a963a760cc2e Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Fri, 17 Apr 2026 21:16:13 -0700
Subject: [PATCH 11/35] Add a note about re-using the RAFT handle

---
 docs/source/getting_started.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index e94b666..662e09a 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -202,3 +202,20 @@ Now that the tree model is fully imported into nvForest, let's run inference:
     fm.predict(output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);
+
+.. note:: Reuse the resource handle to reduce overhead
+
+    nvForest internally creates a resource handle (``raft::device_resources``)
+    to manage GPU resources. Creation of the resource handle adds a slight
+    performance overhead. If you plan to call :cpp:func:`nvforest::predict`
+    multiple times, consider creating the resource handle explicitly
+    and re-using the handle between the function calls.
+
+    .. code-block:: cpp
+
+        #include <raft/core/device_resources.hpp>
+
+        auto resource = raft::device_resources{};
+        fm.predict(resource, output, input, num_rows,
+                   raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+                   nvforest::infer_kind::default_kind);

From 59cfadc205c6ef55fcfcbd6454849e7290b710c9 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Fri, 17 Apr 2026 21:19:21 -0700
Subject: [PATCH 12/35] Add a comment

---
 docs/source/getting_started.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 662e09a..f5d10db 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -216,6 +216,8 @@ Now that the tree model is fully imported into nvForest, let's run inference:
         #include <raft/core/device_resources.hpp>
 
         auto resource = raft::device_resources{};
+
+        // Calling predict multiple times ...
         fm.predict(resource, output, input, num_rows,
                    raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                    nvforest::infer_kind::default_kind);

From 8efe44e5edddca432dd109da19854c9b293a5a40 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 18 Apr 2026 14:03:52 -0700
Subject: [PATCH 13/35] Improved formatting

---
 docs/source/getting_started.rst | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index f5d10db..3257387 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -205,11 +205,13 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 
 .. note:: Reuse the resource handle to reduce overhead
 
-    nvForest internally creates a resource handle (``raft::device_resources``)
+    nvForest internally creates a resource handle
+    (:cpp:class:`raft::device_resources`)
     to manage GPU resources. Creation of the resource handle adds a slight
-    performance overhead. If you plan to call :cpp:func:`nvforest::predict`
-    multiple times, consider creating the resource handle explicitly
-    and re-using the handle between the function calls.
+    performance overhead. If you plan to call
+    :cpp:func:`~nvforest::forest_model::predict` multiple times, consider
+    creating the resource handle explicitly and re-using the handle between
+    the function calls.
 
     .. code-block:: cpp
 

From 564e38c4f9900838a6cb8ea1f99c813e484075ec Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 21 Apr 2026 18:50:05 -0700
Subject: [PATCH 14/35] Create device_resources wrapper in C++

---
 cpp/include/nvforest/device_resources.hpp | 22 ++++++++++++++++++++++
 cpp/include/nvforest/forest_model.hpp     | 15 +++++++--------
 cpp/tests/treelite_importer.cpp           |  5 ++---
 3 files changed, 31 insertions(+), 11 deletions(-)
 create mode 100644 cpp/include/nvforest/device_resources.hpp

diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp
new file mode 100644
index 0000000..04621d3
--- /dev/null
+++ b/cpp/include/nvforest/device_resources.hpp
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <raft/core/device_resources.hpp>
+
+namespace nvforest {
+
+class device_resources {
+ public:
+  device_resources() : res_{} {}
+
+  auto get_next_usable_stream() const { return res_.get_next_usable_stream(); }
+  auto get_stream_pool_size() const { return res_.get_stream_pool_size(); }
+
+ private:
+  raft::device_resources res_;
+};
+
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index be45584..84987a8 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -7,10 +7,9 @@
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/device_resources.hpp>
 #include <nvforest/infer_kind.hpp>
 
-#include <raft/core/device_resources.hpp>
-
 #include <cstddef>
 #include <type_traits>
 #include <variant>
@@ -153,7 +152,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource RAFT resource which will be used to provide
+   * @param[in] resource device resource which will be used to provide
    * streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
@@ -178,7 +177,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft::device_resources const& resource,
+  void predict(nvforest::device_resources const& resource,
                raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
@@ -254,7 +253,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource RAFT resource which will be used to provide
+   * @param[in] resource device resource which will be used to provide
    * streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -278,7 +277,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft::device_resources const& resource,
+  void predict(nvforest::device_resources const& resource,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
@@ -302,7 +301,7 @@ struct forest_model {
   }
 
   /**
-   * Perform inference on given input (with auto-instantiated RAFT resource)
+   * Perform inference on given input (with auto-instantiated device resource)
    *
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -334,7 +333,7 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
-    auto resource = raft::device_resources{};
+    auto resource = nvforest::device_resources{};
     predict(resource,
             output,
             input,
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index f6f3413..d79c423 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -4,12 +4,11 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_resources.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
 
-#include <raft/core/device_resources.hpp>
-
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <treelite/enum/task_type.h>
@@ -339,7 +338,7 @@ TEST(TreeliteImporter, DegenerateTree)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_FALSE(fil_model.has_vector_leaves());
 
-  auto resource       = raft::device_resources{};
+  auto resource       = nvforest::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};

From 13ddf90417114f9426f15147b162bd86520e9b2d Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 21 Apr 2026 19:02:44 -0700
Subject: [PATCH 15/35] Define DeviceResources in Python pkg

---
 cpp/include/nvforest/device_resources.hpp     |  5 +++
 python/nvforest/CMakeLists.txt                |  6 +++
 python/nvforest/nvforest/__init__.py          |  4 +-
 python/nvforest/nvforest/_device_resources.py | 10 +++++
 python/nvforest/nvforest/_factory.py          | 36 ++++++++---------
 python/nvforest/nvforest/_forest_inference.py | 40 +++++++++----------
 python/nvforest/nvforest/_handle.py           |  9 -----
 .../nvforest/detail/device_resources.pxd      | 20 ++++++++++
 .../nvforest/detail/device_resources.pyx      | 17 ++++++++
 .../nvforest/detail/forest_inference.pyx      | 37 +++++++----------
 10 files changed, 112 insertions(+), 72 deletions(-)
 create mode 100644 python/nvforest/nvforest/_device_resources.py
 delete mode 100644 python/nvforest/nvforest/_handle.py
 create mode 100644 python/nvforest/nvforest/detail/device_resources.pxd
 create mode 100644 python/nvforest/nvforest/detail/device_resources.pyx

diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp
index 04621d3..ebe4543 100644
--- a/cpp/include/nvforest/device_resources.hpp
+++ b/cpp/include/nvforest/device_resources.hpp
@@ -14,6 +14,11 @@ class device_resources {
 
   auto get_next_usable_stream() const { return res_.get_next_usable_stream(); }
   auto get_stream_pool_size() const { return res_.get_stream_pool_size(); }
+  void synchronize() const
+  {
+    res_.sync_stream_pool();
+    res_.sync_stream();
+  }
 
  private:
   raft::device_resources res_;
diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt
index fd548e5..1b37788 100644
--- a/python/nvforest/CMakeLists.txt
+++ b/python/nvforest/CMakeLists.txt
@@ -77,3 +77,9 @@ rapids_cython_create_modules(
   SOURCE_FILES "nvforest/detail/treelite.pyx"
   LINKED_LIBRARIES "${linked_libraries}"
   INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
+
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "nvforest/detail/device_resources.pyx"
+  LINKED_LIBRARIES "${linked_libraries}"
+  INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py
index 4fc9b15..27a222a 100644
--- a/python/nvforest/nvforest/__init__.py
+++ b/python/nvforest/nvforest/__init__.py
@@ -13,6 +13,7 @@
     libnvforest.load_library()
     del libnvforest
 
+from nvforest._device_resources import DeviceResources
 from nvforest._factory import (
     load_from_sklearn,
     load_from_treelite_model,
@@ -24,7 +25,6 @@
     GPUForestInferenceClassifier,
     GPUForestInferenceRegressor,
 )
-from nvforest._handle import Handle
 from nvforest._version import __git_commit__, __version__
 
 __all__ = [
@@ -32,7 +32,7 @@
     "CPUForestInferenceRegressor",
     "GPUForestInferenceClassifier",
     "GPUForestInferenceRegressor",
-    "Handle",
+    "DeviceResources",
     "load_model",
     "load_from_sklearn",
     "load_from_treelite_model",
diff --git a/python/nvforest/nvforest/_device_resources.py b/python/nvforest/nvforest/_device_resources.py
new file mode 100644
index 0000000..85991ba
--- /dev/null
+++ b/python/nvforest/nvforest/_device_resources.py
@@ -0,0 +1,10 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from nvforest.detail.device_resources import (
+    DeviceResources as _DeviceResources,
+)
+
+DeviceResources = _DeviceResources
diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py
index cc0700a..e640470 100644
--- a/python/nvforest/nvforest/_factory.py
+++ b/python/nvforest/nvforest/_factory.py
@@ -9,6 +9,7 @@
 import treelite
 
 from nvforest._base import ForestInference
+from nvforest._device_resources import DeviceResources
 from nvforest._forest_inference import (
     CPUForestInferenceClassifier,
     CPUForestInferenceRegressor,
@@ -17,7 +18,6 @@
     infer_device,
     infer_is_classifier,
 )
-from nvforest._handle import Handle
 
 
 def get_forest_inference_class(device, is_classifier) -> type:
@@ -39,7 +39,7 @@ def make_forest_inference_object(
     treelite_model: treelite.Model,
     device: str,
     device_id: Optional[int],
-    handle: Optional[Handle],
+    resource: Optional[DeviceResources],
     layout: str,
     default_chunk_size: Optional[int],
     align_bytes: Optional[int],
@@ -50,7 +50,7 @@ def make_forest_inference_object(
 
     kwargs = dict(
         treelite_model=treelite_model,
-        handle=handle,
+        resource=resource,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -72,7 +72,7 @@ def load_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    resource: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a model into nvForest from a serialized model file.
 
@@ -113,10 +113,10 @@ def load_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
+    resource : nvforest.DeviceResources or None
+        For GPU execution, the device resource containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+        resource will be constructed.
     """
     model_path = pathlib.Path(model_file)
     if not model_path.exists():
@@ -157,7 +157,7 @@ def load_model(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        handle=handle,
+        resource=resource,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -174,7 +174,7 @@ def load_from_sklearn(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    resource: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a Scikit-Learn forest model to nvForest
 
@@ -208,10 +208,10 @@ def load_from_sklearn(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
+    resource : nvforest.DeviceResources or None
+        For GPU execution, the device resource containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+        resource will be constructed.
     """
     tl_model = treelite.sklearn.import_model(skl_model)
 
@@ -219,7 +219,7 @@ def load_from_sklearn(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        handle=handle,
+        resource=resource,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -236,7 +236,7 @@ def load_from_treelite_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    resource: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a Treelite forest model to nvForest
 
@@ -270,16 +270,16 @@ def load_from_treelite_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
+    resource : nvforest.DeviceResources or None
+        For GPU execution, the device resource containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+        resource will be constructed.
     """
     return make_forest_inference_object(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        handle=handle,
+        resource=resource,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py
index 2c3c933..b3de975 100644
--- a/python/nvforest/nvforest/_forest_inference.py
+++ b/python/nvforest/nvforest/_forest_inference.py
@@ -18,7 +18,7 @@
 from cuda.bindings import runtime
 
 from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor
-from nvforest._handle import Handle
+from nvforest._device_resources import DeviceResources
 from nvforest._typing import DataType
 from nvforest.detail.forest_inference import ForestInferenceImpl
 
@@ -123,7 +123,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        resource: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -250,7 +250,7 @@ def optimize(
             else:
                 test_instances[layout] = type(self)._create_with_layout(
                     treelite_model_bytes=self.forest.treelite_model_bytes,
-                    handle=self.forest.handle,
+                    resource=self.forest.resource,
                     layout=layout,
                     default_chunk_size=None,
                     align_bytes=self.forest.align_bytes,
@@ -293,7 +293,7 @@ def optimize(
         # Return a new instance with optimal settings
         return type(self)._create_with_layout(
             treelite_model_bytes=self.forest.treelite_model_bytes,
-            handle=self.forest.handle,
+            resource=self.forest.resource,
             layout=optimal_layout,
             default_chunk_size=optimal_chunk_size,
             align_bytes=self.forest.align_bytes,
@@ -310,7 +310,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        resource: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -322,7 +322,7 @@ def __init__(
             treelite_model=treelite_model,
             device="cpu",
             device_id=-1,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -334,7 +334,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        resource: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -346,7 +346,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -425,7 +425,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        resource: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -437,7 +437,7 @@ def __init__(
             treelite_model=treelite_model,
             device="cpu",
             device_id=-1,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -449,7 +449,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        resource: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -461,7 +461,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -532,7 +532,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        resource: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -545,7 +545,7 @@ def __init__(
             treelite_model=treelite_model,
             device="gpu",
             device_id=device_id,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -557,7 +557,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        resource: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -569,7 +569,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -649,7 +649,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        resource: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -662,7 +662,7 @@ def __init__(
             treelite_model=treelite_model,
             device="gpu",
             device_id=device_id,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -674,7 +674,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        resource: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -686,7 +686,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            handle=handle,
+            resource=resource,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
deleted file mode 100644
index 0633eee..0000000
--- a/python/nvforest/nvforest/_handle.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-from pylibraft.common.handle import DeviceResources as RaftDeviceResources
-
-# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources
-Handle = RaftDeviceResources
diff --git a/python/nvforest/nvforest/detail/device_resources.pxd b/python/nvforest/nvforest/detail/device_resources.pxd
new file mode 100644
index 0000000..8aa4aef
--- /dev/null
+++ b/python/nvforest/nvforest/detail/device_resources.pxd
@@ -0,0 +1,20 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from libcpp.memory cimport unique_ptr
+
+from nvforest.detail.raft_proto.cuda_stream cimport (
+    cuda_stream as raft_proto_stream_t,
+)
+
+
+cdef extern from "nvforest/device_resources.hpp" namespace "nvforest" nogil:
+    cdef cppclass device_resources:
+        device_resources() except +
+        raft_proto_stream_t get_next_usable_stream() except +
+        void synchronize() except +
+
+cdef class DeviceResources:
+    cdef unique_ptr[device_resources] c_obj
diff --git a/python/nvforest/nvforest/detail/device_resources.pyx b/python/nvforest/nvforest/detail/device_resources.pyx
new file mode 100644
index 0000000..98ca786
--- /dev/null
+++ b/python/nvforest/nvforest/detail/device_resources.pyx
@@ -0,0 +1,17 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+cdef class DeviceResources:
+    def __cinit__(self, c_obj=None):
+        self.c_obj.reset(new device_resources())
+
+    def __getstate__(self):
+        return object()
+
+    def __setstate__(self, state):
+        self.c_obj.reset(new device_resources())
+
+    def get_c_obj(self):
+        return <size_t> self.c_obj.get()
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 946c3fa..557d3ee 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -8,15 +8,15 @@ from typing import Optional, Union
 import numpy as np
 import treelite
 
-from nvforest._handle import Handle
+from nvforest._device_resources import DeviceResources
 from nvforest._typing import DataType
 from nvforest.detail.treelite import safe_treelite_call
 
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
-from rmm.librmm.cuda_stream_view cimport cuda_stream_view
 
+from nvforest.detail.device_resources cimport device_resources
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
 from nvforest.detail.raft_proto.cuda_stream cimport (
@@ -34,14 +34,6 @@ from nvforest.detail.treelite cimport (
 )
 
 
-cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil:
-    cdef cppclass device_resources:
-        device_resources() except +
-        cuda_stream_view get_next_usable_stream() except +
-        void sync_stream() except +
-        void sync_stream_pool() except +
-
-
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
@@ -77,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
 
 cdef class ForestInference_impl():
     cdef forest_model model
-    cdef object py_handle
-    cdef device_resources* c_handle
+    cdef object py_resource
+    cdef device_resources* c_resource
     cdef object device
 
     def __cinit__(
         self,
-        handle: object,
+        resource: object,
         tl_model_bytes: Union[bytes, bytearray],
         *,
         layout: str = "depth_first",
@@ -92,8 +84,8 @@ cdef class ForestInference_impl():
         device: str = "cpu",
         device_id: Optional[int] = None,
     ):
-        self.py_handle = handle
-        self.c_handle = <device_resources*><size_t>self.py_handle.getHandle()
+        self.py_resource = resource
+        self.c_resource = <device_resources*><size_t>self.py_resource.get_c_obj()
 
         cdef optional[bool] use_double_precision_c
         cdef bool use_double_precision_bool
@@ -138,7 +130,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            <raft_proto_stream_t> self.c_handle.get_next_usable_stream().value()
+            self.c_resource.get_next_usable_stream()
         )
 
         safe_treelite_call(
@@ -248,7 +240,7 @@ cdef class ForestInference_impl():
 
         if model_dtype == np.float32:
             self.model.predict[float](
-                deref(self.c_handle),
+                deref(self.c_resource),
                 <float *> out_ptr,
                 <float *> in_ptr,
                 n_rows,
@@ -259,7 +251,7 @@ cdef class ForestInference_impl():
             )
         else:
             self.model.predict[double](
-                deref(self.c_handle),
+                deref(self.c_resource),
                 <double *> out_ptr,
                 <double *> in_ptr,
                 n_rows,
@@ -270,8 +262,7 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.c_handle.sync_stream_pool()
-            self.c_handle.sync_stream()
+            self.c_resource.synchronize()
         return preds
 
 
@@ -282,7 +273,7 @@ class ForestInferenceImpl:
         treelite_model: treelite.Model,
         device: str,
         device_id: int,
-        handle: Optional[Handle] = None,
+        resource: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -290,7 +281,7 @@ class ForestInferenceImpl:
     ):
         # Assumption: The caller needs to pass in correct (device, device_id) pair
         # This function will not contain any logic for auto-detecting device.
-        self.handle = Handle() if handle is None else handle
+        self.resource = DeviceResources() if resource is None else resource
         self._layout = layout
         self.precision = precision
         self.default_chunk_size = default_chunk_size
@@ -318,7 +309,7 @@ class ForestInferenceImpl:
         self._treelite_model_bytes = treelite_model.serialize_bytes()
 
         self.impl = ForestInference_impl(
-            self.handle,
+            self.resource,
             self._treelite_model_bytes,
             layout=self._layout,
             align_bytes=self.align_bytes,

From ed7627e2f3026ee997d0ddf7248a4fa2438cee62 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 21 Apr 2026 19:18:57 -0700
Subject: [PATCH 16/35] Update doc

---
 docs/source/getting_started.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 3257387..11d70d3 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -206,7 +206,7 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 .. note:: Reuse the resource handle to reduce overhead
 
     nvForest internally creates a resource handle
-    (:cpp:class:`raft::device_resources`)
+    (:cpp:class:`nvforest::device_resources`)
     to manage GPU resources. Creation of the resource handle adds a slight
     performance overhead. If you plan to call
     :cpp:func:`~nvforest::forest_model::predict` multiple times, consider
@@ -215,9 +215,9 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 
     .. code-block:: cpp
 
-        #include <raft/core/device_resources.hpp>
+        #include <nvforest/device_resources.hpp>
 
-        auto resource = raft::device_resources{};
+        auto resource = nvforest::device_resources{};
 
         // Calling predict multiple times ...
         fm.predict(resource, output, input, num_rows,

From 65b83cea10de1911ac9761bca39dc0c25c940969 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 2 May 2026 01:20:04 -0700
Subject: [PATCH 17/35] Revert "Define DeviceResources in Python pkg"

This reverts commit 13ddf90417114f9426f15147b162bd86520e9b2d.
---
 cpp/include/nvforest/device_resources.hpp     |  5 ---
 python/nvforest/CMakeLists.txt                |  6 ---
 python/nvforest/nvforest/__init__.py          |  4 +-
 python/nvforest/nvforest/_device_resources.py | 10 -----
 python/nvforest/nvforest/_factory.py          | 36 ++++++++---------
 python/nvforest/nvforest/_forest_inference.py | 40 +++++++++----------
 python/nvforest/nvforest/_handle.py           |  9 +++++
 .../nvforest/detail/device_resources.pxd      | 20 ----------
 .../nvforest/detail/device_resources.pyx      | 17 --------
 .../nvforest/detail/forest_inference.pyx      | 37 ++++++++++-------
 10 files changed, 72 insertions(+), 112 deletions(-)
 delete mode 100644 python/nvforest/nvforest/_device_resources.py
 create mode 100644 python/nvforest/nvforest/_handle.py
 delete mode 100644 python/nvforest/nvforest/detail/device_resources.pxd
 delete mode 100644 python/nvforest/nvforest/detail/device_resources.pyx

diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp
index ebe4543..04621d3 100644
--- a/cpp/include/nvforest/device_resources.hpp
+++ b/cpp/include/nvforest/device_resources.hpp
@@ -14,11 +14,6 @@ class device_resources {
 
   auto get_next_usable_stream() const { return res_.get_next_usable_stream(); }
   auto get_stream_pool_size() const { return res_.get_stream_pool_size(); }
-  void synchronize() const
-  {
-    res_.sync_stream_pool();
-    res_.sync_stream();
-  }
 
  private:
   raft::device_resources res_;
diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt
index 1b37788..fd548e5 100644
--- a/python/nvforest/CMakeLists.txt
+++ b/python/nvforest/CMakeLists.txt
@@ -77,9 +77,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "nvforest/detail/treelite.pyx"
   LINKED_LIBRARIES "${linked_libraries}"
   INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
-
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "nvforest/detail/device_resources.pyx"
-  LINKED_LIBRARIES "${linked_libraries}"
-  INSTALL_DIR nvforest/detail MODULE_PREFIX nvforest_)
diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py
index 27a222a..4fc9b15 100644
--- a/python/nvforest/nvforest/__init__.py
+++ b/python/nvforest/nvforest/__init__.py
@@ -13,7 +13,6 @@
     libnvforest.load_library()
     del libnvforest
 
-from nvforest._device_resources import DeviceResources
 from nvforest._factory import (
     load_from_sklearn,
     load_from_treelite_model,
@@ -25,6 +24,7 @@
     GPUForestInferenceClassifier,
     GPUForestInferenceRegressor,
 )
+from nvforest._handle import Handle
 from nvforest._version import __git_commit__, __version__
 
 __all__ = [
@@ -32,7 +32,7 @@
     "CPUForestInferenceRegressor",
     "GPUForestInferenceClassifier",
     "GPUForestInferenceRegressor",
-    "DeviceResources",
+    "Handle",
     "load_model",
     "load_from_sklearn",
     "load_from_treelite_model",
diff --git a/python/nvforest/nvforest/_device_resources.py b/python/nvforest/nvforest/_device_resources.py
deleted file mode 100644
index 85991ba..0000000
--- a/python/nvforest/nvforest/_device_resources.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-from nvforest.detail.device_resources import (
-    DeviceResources as _DeviceResources,
-)
-
-DeviceResources = _DeviceResources
diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py
index e640470..cc0700a 100644
--- a/python/nvforest/nvforest/_factory.py
+++ b/python/nvforest/nvforest/_factory.py
@@ -9,7 +9,6 @@
 import treelite
 
 from nvforest._base import ForestInference
-from nvforest._device_resources import DeviceResources
 from nvforest._forest_inference import (
     CPUForestInferenceClassifier,
     CPUForestInferenceRegressor,
@@ -18,6 +17,7 @@
     infer_device,
     infer_is_classifier,
 )
+from nvforest._handle import Handle
 
 
 def get_forest_inference_class(device, is_classifier) -> type:
@@ -39,7 +39,7 @@ def make_forest_inference_object(
     treelite_model: treelite.Model,
     device: str,
     device_id: Optional[int],
-    resource: Optional[DeviceResources],
+    handle: Optional[Handle],
     layout: str,
     default_chunk_size: Optional[int],
     align_bytes: Optional[int],
@@ -50,7 +50,7 @@ def make_forest_inference_object(
 
     kwargs = dict(
         treelite_model=treelite_model,
-        resource=resource,
+        handle=handle,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -72,7 +72,7 @@ def load_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    resource: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a model into nvForest from a serialized model file.
 
@@ -113,10 +113,10 @@ def load_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    resource : nvforest.DeviceResources or None
-        For GPU execution, the device resource containing the stream or stream
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        resource will be constructed.
+        handle will be constructed.
     """
     model_path = pathlib.Path(model_file)
     if not model_path.exists():
@@ -157,7 +157,7 @@ def load_model(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        resource=resource,
+        handle=handle,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -174,7 +174,7 @@ def load_from_sklearn(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    resource: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a Scikit-Learn forest model to nvForest
 
@@ -208,10 +208,10 @@ def load_from_sklearn(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    resource : nvforest.DeviceResources or None
-        For GPU execution, the device resource containing the stream or stream
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        resource will be constructed.
+        handle will be constructed.
     """
     tl_model = treelite.sklearn.import_model(skl_model)
 
@@ -219,7 +219,7 @@ def load_from_sklearn(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        resource=resource,
+        handle=handle,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
@@ -236,7 +236,7 @@ def load_from_treelite_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    resource: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a Treelite forest model to nvForest
 
@@ -270,16 +270,16 @@ def load_from_treelite_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    resource : nvforest.DeviceResources or None
-        For GPU execution, the device resource containing the stream or stream
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
         pool to use during loading and inference. If not given, a new
-        resource will be constructed.
+        handle will be constructed.
     """
     return make_forest_inference_object(
         treelite_model=tl_model,
         device=device,
         device_id=device_id,
-        resource=resource,
+        handle=handle,
         layout=layout,
         default_chunk_size=default_chunk_size,
         align_bytes=align_bytes,
diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py
index b3de975..2c3c933 100644
--- a/python/nvforest/nvforest/_forest_inference.py
+++ b/python/nvforest/nvforest/_forest_inference.py
@@ -18,7 +18,7 @@
 from cuda.bindings import runtime
 
 from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor
-from nvforest._device_resources import DeviceResources
+from nvforest._handle import Handle
 from nvforest._typing import DataType
 from nvforest.detail.forest_inference import ForestInferenceImpl
 
@@ -123,7 +123,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        resource: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -250,7 +250,7 @@ def optimize(
             else:
                 test_instances[layout] = type(self)._create_with_layout(
                     treelite_model_bytes=self.forest.treelite_model_bytes,
-                    resource=self.forest.resource,
+                    handle=self.forest.handle,
                     layout=layout,
                     default_chunk_size=None,
                     align_bytes=self.forest.align_bytes,
@@ -293,7 +293,7 @@ def optimize(
         # Return a new instance with optimal settings
         return type(self)._create_with_layout(
             treelite_model_bytes=self.forest.treelite_model_bytes,
-            resource=self.forest.resource,
+            handle=self.forest.handle,
             layout=optimal_layout,
             default_chunk_size=optimal_chunk_size,
             align_bytes=self.forest.align_bytes,
@@ -310,7 +310,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        resource: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -322,7 +322,7 @@ def __init__(
             treelite_model=treelite_model,
             device="cpu",
             device_id=-1,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -334,7 +334,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        resource: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -346,7 +346,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -425,7 +425,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        resource: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -437,7 +437,7 @@ def __init__(
             treelite_model=treelite_model,
             device="cpu",
             device_id=-1,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -449,7 +449,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        resource: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -461,7 +461,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -532,7 +532,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        resource: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -545,7 +545,7 @@ def __init__(
             treelite_model=treelite_model,
             device="gpu",
             device_id=device_id,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -557,7 +557,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        resource: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -569,7 +569,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -649,7 +649,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        resource: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -662,7 +662,7 @@ def __init__(
             treelite_model=treelite_model,
             device="gpu",
             device_id=device_id,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
@@ -674,7 +674,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        resource: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -686,7 +686,7 @@ def _create_with_layout(
         tl_model = treelite.Model.deserialize_bytes(treelite_model_bytes)
         return cls(
             treelite_model=tl_model,
-            resource=resource,
+            handle=handle,
             layout=layout,
             default_chunk_size=default_chunk_size,
             align_bytes=align_bytes,
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
new file mode 100644
index 0000000..0633eee
--- /dev/null
+++ b/python/nvforest/nvforest/_handle.py
@@ -0,0 +1,9 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from pylibraft.common.handle import DeviceResources as RaftDeviceResources
+
+# For now, nvforest._handle.Handle is an alias of pylibraft.common.handle.DeviceResources
+Handle = RaftDeviceResources
diff --git a/python/nvforest/nvforest/detail/device_resources.pxd b/python/nvforest/nvforest/detail/device_resources.pxd
deleted file mode 100644
index 8aa4aef..0000000
--- a/python/nvforest/nvforest/detail/device_resources.pxd
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-from libcpp.memory cimport unique_ptr
-
-from nvforest.detail.raft_proto.cuda_stream cimport (
-    cuda_stream as raft_proto_stream_t,
-)
-
-
-cdef extern from "nvforest/device_resources.hpp" namespace "nvforest" nogil:
-    cdef cppclass device_resources:
-        device_resources() except +
-        raft_proto_stream_t get_next_usable_stream() except +
-        void synchronize() except +
-
-cdef class DeviceResources:
-    cdef unique_ptr[device_resources] c_obj
diff --git a/python/nvforest/nvforest/detail/device_resources.pyx b/python/nvforest/nvforest/detail/device_resources.pyx
deleted file mode 100644
index 98ca786..0000000
--- a/python/nvforest/nvforest/detail/device_resources.pyx
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-
-cdef class DeviceResources:
-    def __cinit__(self, c_obj=None):
-        self.c_obj.reset(new device_resources())
-
-    def __getstate__(self):
-        return object()
-
-    def __setstate__(self, state):
-        self.c_obj.reset(new device_resources())
-
-    def get_c_obj(self):
-        return <size_t> self.c_obj.get()
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 557d3ee..946c3fa 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -8,15 +8,15 @@ from typing import Optional, Union
 import numpy as np
 import treelite
 
-from nvforest._device_resources import DeviceResources
+from nvforest._handle import Handle
 from nvforest._typing import DataType
 from nvforest.detail.treelite import safe_treelite_call
 
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
 
-from nvforest.detail.device_resources cimport device_resources
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
 from nvforest.detail.raft_proto.cuda_stream cimport (
@@ -34,6 +34,14 @@ from nvforest.detail.treelite cimport (
 )
 
 
+cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil:
+    cdef cppclass device_resources:
+        device_resources() except +
+        cuda_stream_view get_next_usable_stream() except +
+        void sync_stream() except +
+        void sync_stream_pool() except +
+
+
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
@@ -69,13 +77,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
 
 cdef class ForestInference_impl():
     cdef forest_model model
-    cdef object py_resource
-    cdef device_resources* c_resource
+    cdef object py_handle
+    cdef device_resources* c_handle
     cdef object device
 
     def __cinit__(
         self,
-        resource: object,
+        handle: object,
         tl_model_bytes: Union[bytes, bytearray],
         *,
         layout: str = "depth_first",
@@ -84,8 +92,8 @@ cdef class ForestInference_impl():
         device: str = "cpu",
         device_id: Optional[int] = None,
     ):
-        self.py_resource = resource
-        self.c_resource = <device_resources*><size_t>self.py_resource.get_c_obj()
+        self.py_handle = handle
+        self.c_handle = <device_resources*><size_t>self.py_handle.getHandle()
 
         cdef optional[bool] use_double_precision_c
         cdef bool use_double_precision_bool
@@ -130,7 +138,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            self.c_resource.get_next_usable_stream()
+            <raft_proto_stream_t> self.c_handle.get_next_usable_stream().value()
         )
 
         safe_treelite_call(
@@ -240,7 +248,7 @@ cdef class ForestInference_impl():
 
         if model_dtype == np.float32:
             self.model.predict[float](
-                deref(self.c_resource),
+                deref(self.c_handle),
                 <float *> out_ptr,
                 <float *> in_ptr,
                 n_rows,
@@ -251,7 +259,7 @@ cdef class ForestInference_impl():
             )
         else:
             self.model.predict[double](
-                deref(self.c_resource),
+                deref(self.c_handle),
                 <double *> out_ptr,
                 <double *> in_ptr,
                 n_rows,
@@ -262,7 +270,8 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.c_resource.synchronize()
+            self.c_handle.sync_stream_pool()
+            self.c_handle.sync_stream()
         return preds
 
 
@@ -273,7 +282,7 @@ class ForestInferenceImpl:
         treelite_model: treelite.Model,
         device: str,
         device_id: int,
-        resource: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -281,7 +290,7 @@ class ForestInferenceImpl:
     ):
         # Assumption: The caller needs to pass in correct (device, device_id) pair
         # This function will not contain any logic for auto-detecting device.
-        self.resource = DeviceResources() if resource is None else resource
+        self.handle = Handle() if handle is None else handle
         self._layout = layout
         self.precision = precision
         self.default_chunk_size = default_chunk_size
@@ -309,7 +318,7 @@ class ForestInferenceImpl:
         self._treelite_model_bytes = treelite_model.serialize_bytes()
 
         self.impl = ForestInference_impl(
-            self.resource,
+            self.handle,
             self._treelite_model_bytes,
             layout=self._layout,
             align_bytes=self.align_bytes,

From 0c314cb421e5ec44b82e91c5a321a47625f7cf59 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 2 May 2026 01:20:07 -0700
Subject: [PATCH 18/35] Revert "Create device_resource wrapper in C++"

This reverts commit 564e38c4f9900838a6cb8ea1f99c813e484075ec.
---
 cpp/include/nvforest/device_resources.hpp | 22 ----------------------
 cpp/include/nvforest/forest_model.hpp     | 15 ++++++++-------
 cpp/tests/treelite_importer.cpp           |  5 +++--
 3 files changed, 11 insertions(+), 31 deletions(-)
 delete mode 100644 cpp/include/nvforest/device_resources.hpp

diff --git a/cpp/include/nvforest/device_resources.hpp b/cpp/include/nvforest/device_resources.hpp
deleted file mode 100644
index 04621d3..0000000
--- a/cpp/include/nvforest/device_resources.hpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-
-#include <raft/core/device_resources.hpp>
-
-namespace nvforest {
-
-class device_resources {
- public:
-  device_resources() : res_{} {}
-
-  auto get_next_usable_stream() const { return res_.get_next_usable_stream(); }
-  auto get_stream_pool_size() const { return res_.get_stream_pool_size(); }
-
- private:
-  raft::device_resources res_;
-};
-
-}  // namespace nvforest
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 84987a8..be45584 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -7,9 +7,10 @@
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/device_resources.hpp>
 #include <nvforest/infer_kind.hpp>
 
+#include <raft/core/device_resources.hpp>
+
 #include <cstddef>
 #include <type_traits>
 #include <variant>
@@ -152,7 +153,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource device resource which will be used to provide
+   * @param[in] resource RAFT resource which will be used to provide
    * streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
@@ -177,7 +178,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(nvforest::device_resources const& resource,
+  void predict(raft::device_resources const& resource,
                raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
@@ -253,7 +254,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource device resource which will be used to provide
+   * @param[in] resource RAFT resource which will be used to provide
    * streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -277,7 +278,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(nvforest::device_resources const& resource,
+  void predict(raft::device_resources const& resource,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
@@ -301,7 +302,7 @@ struct forest_model {
   }
 
   /**
-   * Perform inference on given input (with auto-instantiated device resource)
+   * Perform inference on given input (with auto-instantiated RAFT resource)
    *
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -333,7 +334,7 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
-    auto resource = nvforest::device_resources{};
+    auto resource = raft::device_resources{};
     predict(resource,
             output,
             input,
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index d79c423..f6f3413 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -4,11 +4,12 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/device_resources.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
 
+#include <raft/core/device_resources.hpp>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <treelite/enum/task_type.h>
@@ -338,7 +339,7 @@ TEST(TreeliteImporter, DegenerateTree)
   auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_FALSE(fil_model.has_vector_leaves());
 
-  auto resource       = nvforest::device_resources{};
+  auto resource       = raft::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};

From b3fd3be434c2d53e1a3fd27df1f39bb44b0fbc50 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 2 May 2026 02:32:24 -0700
Subject: [PATCH 19/35] Cache auto-instantiated RAFT resource

---
 cpp/include/nvforest/forest_model.hpp |  12 ++-
 cpp/tests/treelite_importer.cpp       | 107 +++++++++++++-------------
 2 files changed, 64 insertions(+), 55 deletions(-)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index be45584..0fd75a4 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -12,6 +12,7 @@
 #include <raft/core/device_resources.hpp>
 
 #include <cstddef>
+#include <memory>
 #include <type_traits>
 #include <variant>
 
@@ -26,7 +27,7 @@ namespace nvforest {
 struct forest_model {
   /** Wrap a decision_forest in a full forest_model object */
   forest_model(decision_forest_variant&& forest = decision_forest_variant{})
-    : decision_forest_{forest}
+    : decision_forest_{forest}, cached_device_resources_{}
   {
   }
 
@@ -334,8 +335,11 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
-    auto resource = raft::device_resources{};
-    predict(resource,
+    // Auto-instantiate RAFT resource and cache it
+    if (!cached_device_resources_) {
+      cached_device_resources_ = std::make_unique<raft::device_resources>();
+    }
+    predict(*cached_device_resources_,
             output,
             input,
             num_rows,
@@ -347,6 +351,8 @@ struct forest_model {
 
  private:
   decision_forest_variant decision_forest_;
+  // Cache for auto-instantiated RAFT device resource
+  std::unique_ptr<raft::device_resources> cached_device_resources_;
 };
 
 }  // namespace nvforest
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index f6f3413..219ba2f 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -249,45 +249,45 @@ auto static const SAMPLE_FOREST               = []() {
 
 TEST(TreeliteImporter, depth_first)
 {
-  auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first);
-  ASSERT_EQ(fil_model.num_features(), 7);
-  ASSERT_EQ(fil_model.num_outputs(), 1);
-  ASSERT_EQ(fil_model.num_trees(), 6);
-  ASSERT_FALSE(fil_model.has_vector_leaves());
-  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(fil_model.device_index(), -1);
-  ASSERT_FALSE(fil_model.is_double_precision());
+  auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first);
+  ASSERT_EQ(nvforest_model.num_features(), 7);
+  ASSERT_EQ(nvforest_model.num_outputs(), 1);
+  ASSERT_EQ(nvforest_model.num_trees(), 6);
+  ASSERT_FALSE(nvforest_model.has_vector_leaves());
+  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.device_index(), -1);
+  ASSERT_FALSE(nvforest_model.is_double_precision());
 }
 
 TEST(TreeliteImporter, breadth_first)
 {
-  auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first);
-  ASSERT_EQ(fil_model.num_features(), 7);
-  ASSERT_EQ(fil_model.num_outputs(), 1);
-  ASSERT_EQ(fil_model.num_trees(), 6);
-  ASSERT_FALSE(fil_model.has_vector_leaves());
-  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(fil_model.device_index(), -1);
-  ASSERT_FALSE(fil_model.is_double_precision());
+  auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first);
+  ASSERT_EQ(nvforest_model.num_features(), 7);
+  ASSERT_EQ(nvforest_model.num_outputs(), 1);
+  ASSERT_EQ(nvforest_model.num_trees(), 6);
+  ASSERT_FALSE(nvforest_model.has_vector_leaves());
+  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.device_index(), -1);
+  ASSERT_FALSE(nvforest_model.is_double_precision());
 }
 
 TEST(TreeliteImporter, layered_children_together)
 {
-  auto fil_model =
+  auto nvforest_model =
     import_from_treelite_model(*SAMPLE_FOREST, tree_layout::layered_children_together);
-  ASSERT_EQ(fil_model.num_features(), 7);
-  ASSERT_EQ(fil_model.num_outputs(), 1);
-  ASSERT_EQ(fil_model.num_trees(), 6);
-  ASSERT_FALSE(fil_model.has_vector_leaves());
-  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(fil_model.device_index(), -1);
-  ASSERT_FALSE(fil_model.is_double_precision());
+  ASSERT_EQ(nvforest_model.num_features(), 7);
+  ASSERT_EQ(nvforest_model.num_outputs(), 1);
+  ASSERT_EQ(nvforest_model.num_trees(), 6);
+  ASSERT_FALSE(nvforest_model.has_vector_leaves());
+  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.device_index(), -1);
+  ASSERT_FALSE(nvforest_model.is_double_precision());
 }
 
 template <bool use_leaf_vector, typename leaf_t>
@@ -335,42 +335,45 @@ auto make_degenerate_tree(const leaf_t& leaf)
 
 TEST(TreeliteImporter, DegenerateTree)
 {
-  auto tl_model  = make_degenerate_tree<false>(1.0);
-  auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
-  ASSERT_FALSE(fil_model.has_vector_leaves());
+  auto tl_model       = make_degenerate_tree<false>(1.0);
+  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+  ASSERT_FALSE(nvforest_model.has_vector_leaves());
 
   auto resource       = raft::device_resources{};
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};
-  fil_model.predict(resource,
-                    preds.data(),
-                    X.data(),
-                    1,
-                    raft_proto::device_type::cpu,
-                    raft_proto::device_type::cpu,
-                    nvforest::infer_kind::default_kind,
-                    1);
+  nvforest_model.predict(resource,
+                         preds.data(),
+                         X.data(),
+                         1,
+                         raft_proto::device_type::cpu,
+                         raft_proto::device_type::cpu,
+                         nvforest::infer_kind::default_kind,
+                         1);
   ASSERT_EQ(preds, expected_preds);
 }
 
 TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
 {
-  auto tl_model  = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});
-  auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
-  ASSERT_TRUE(fil_model.has_vector_leaves());
+  auto tl_model       = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});
+  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+  ASSERT_TRUE(nvforest_model.has_vector_leaves());
 
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};
-  fil_model.predict(preds.data(),
-                    X.data(),
-                    1,
-                    raft_proto::device_type::cpu,
-                    raft_proto::device_type::cpu,
-                    nvforest::infer_kind::default_kind,
-                    1);
-  ASSERT_EQ(preds, expected_preds);
+  for (int i = 0; i < 3; ++i) {
+    // Make sure that auto-instantiated RAFT resource gets cached properly
+    nvforest_model.predict(preds.data(),
+                           X.data(),
+                           1,
+                           raft_proto::device_type::cpu,
+                           raft_proto::device_type::cpu,
+                           nvforest::infer_kind::default_kind,
+                           1);
+    ASSERT_EQ(preds, expected_preds);
+  }
 }
 
 }  // namespace nvforest

From a1f05d329effeb6e2d2d5b9252684d52beaf83fb Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 2 May 2026 02:33:47 -0700
Subject: [PATCH 20/35] Remove the note about RAFT resource

---
 docs/source/getting_started.rst | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 11d70d3..e94b666 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -202,24 +202,3 @@ Now that the tree model is fully imported into nvForest, let's run inference:
     fm.predict(output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);
-
-.. note:: Reuse the resource handle to reduce overhead
-
-    nvForest internally creates a resource handle
-    (:cpp:class:`nvforest::device_resources`)
-    to manage GPU resources. Creation of the resource handle adds a slight
-    performance overhead. If you plan to call
-    :cpp:func:`~nvforest::forest_model::predict` multiple times, consider
-    creating the resource handle explicitly and re-using the handle between
-    the function calls.
-
-    .. code-block:: cpp
-
-        #include <nvforest/device_resources.hpp>
-
-        auto resource = nvforest::device_resources{};
-
-        // Calling predict multiple times ...
-        fm.predict(resource, output, input, num_rows,
-                   raft_proto::device_type::gpu, raft_proto::device_type::gpu,
-                   nvforest::infer_kind::default_kind);

From 5fc3d6d69383a9e9ca8b62733cd0a4ae40940d37 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Sat, 2 May 2026 02:57:14 -0700
Subject: [PATCH 21/35] Add sync for auto-instantiated RAFT resource

---
 cpp/include/nvforest/forest_model.hpp         |  6 ++++-
 cpp/tests/CMakeLists.txt                      |  2 +-
 ...lite_importer.cpp => treelite_importer.cu} | 25 +++++++++++++------
 3 files changed, 23 insertions(+), 10 deletions(-)
 rename cpp/tests/{treelite_importer.cpp => treelite_importer.cu} (93%)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 0fd75a4..cb51a26 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -303,7 +303,9 @@ struct forest_model {
   }
 
   /**
-   * Perform inference on given input (with auto-instantiated RAFT resource)
+   * Perform inference on given input (with auto-instantiated RAFT resource).
+   * Note. This function is blocking and will synchronize the underlying RAFT
+   * resource at return time.
    *
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -347,6 +349,8 @@ struct forest_model {
             in_mem_type,
             predict_type,
             specified_chunk_size);
+    cached_device_resources_->sync_stream_pool();
+    cached_device_resources_->sync_stream();
   }
 
  private:
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 18bef95..db7d7f8 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -84,6 +84,6 @@ ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp)
 ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu)
 ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp)
 ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp)
-ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp)
+ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cu)
 
 rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libnvforest)
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cu
similarity index 93%
rename from cpp/tests/treelite_importer.cpp
rename to cpp/tests/treelite_importer.cu
index 219ba2f..69af84d 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cu
@@ -10,6 +10,8 @@
 
 #include <raft/core/device_resources.hpp>
 
+#include <thrust/device_vector.h>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <treelite/enum/task_type.h>
@@ -357,22 +359,29 @@ TEST(TreeliteImporter, DegenerateTree)
 TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
 {
   auto tl_model       = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});
-  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+  auto nvforest_model = import_from_treelite_model(*tl_model,
+                                                   tree_layout::breadth_first,
+                                                   index_type{},
+                                                   std::nullopt,
+                                                   raft_proto::device_type::gpu);
   ASSERT_TRUE(nvforest_model.has_vector_leaves());
 
-  auto X              = std::vector<double>{0.0};
-  auto preds          = std::vector<double>(2, 0.0);
+  auto X              = thrust::device_vector<double>{0.0};
+  auto preds          = thrust::device_vector<double>(2, 0.0);
+  auto h_preds        = std::vector<double>(2);
   auto expected_preds = std::vector<double>{0.5, 0.5};
   for (int i = 0; i < 3; ++i) {
     // Make sure that auto-instantiated RAFT resource gets cached properly
-    nvforest_model.predict(preds.data(),
-                           X.data(),
+    // For this interface, predict() will synchronize the stream automatically.
+    nvforest_model.predict(thrust::raw_pointer_cast(preds.data()),
+                           thrust::raw_pointer_cast(X.data()),
                            1,
-                           raft_proto::device_type::cpu,
-                           raft_proto::device_type::cpu,
+                           raft_proto::device_type::gpu,
+                           raft_proto::device_type::gpu,
                            nvforest::infer_kind::default_kind,
                            1);
-    ASSERT_EQ(preds, expected_preds);
+    thrust::copy(preds.begin(), preds.end(), h_preds.begin());
+    ASSERT_EQ(h_preds, expected_preds);
   }
 }
 

From 3be33b9340b3129894fa213ede7ec29c6c39d995 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 02:19:54 +0000
Subject: [PATCH 22/35] Fix RAFT resource stream conversion

---
 cpp/include/nvforest/forest_model.hpp | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index cef399c..9f5a951 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -196,8 +196,11 @@ struct forest_model {
         using model_io_t = typename std::remove_reference_t<decltype(concrete_forest)>::io_type;
         if constexpr (std::is_same_v<model_io_t, io_t>) {
           if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) {
-            concrete_forest.predict(
-              output, input, resource.get_next_usable_stream(), predict_type, specified_chunk_size);
+            concrete_forest.predict(output,
+                                    input,
+                                    get_next_raft_proto_stream(resource),
+                                    predict_type,
+                                    specified_chunk_size);
           } else {
             auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64};
             auto constexpr static const MAX_CHUNK_SIZE           = std::size_t{64};
@@ -209,7 +212,7 @@ struct forest_model {
                        specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION);
             auto partition_count = raft_proto::ceildiv(row_count, partition_size);
             for (auto i = std::size_t{}; i < partition_count; ++i) {
-              auto stream = resource.get_next_usable_stream();
+              auto stream = get_next_raft_proto_stream(resource);
               auto rows_in_this_partition =
                 std::min(partition_size, row_count - i * partition_size);
               auto partition_in = raft_proto::buffer<io_t>{};
@@ -363,6 +366,17 @@ struct forest_model {
   }
 
  private:
+  static raft_proto::cuda_stream get_next_raft_proto_stream(
+    raft::device_resources const& resource)
+  {
+#ifdef NVFOREST_ENABLE_GPU
+    return resource.get_next_usable_stream().value();
+#else
+    (void)resource;
+    return raft_proto::cuda_stream{};
+#endif
+  }
+
   decision_forest_variant decision_forest_;
   // Cache for auto-instantiated RAFT device resource
   std::unique_ptr<raft::device_resources> cached_device_resources_;

From bb6616ad73679aaa2ebd741e06ae235570dcb1c7 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 02:30:40 +0000
Subject: [PATCH 23/35] Move RAFT stream adapter into raft_proto

---
 .../nvforest/detail/raft_proto/resources.hpp  | 23 +++++++++++++++++++
 cpp/include/nvforest/forest_model.hpp         | 18 +++------------
 2 files changed, 26 insertions(+), 15 deletions(-)
 create mode 100644 cpp/include/nvforest/detail/raft_proto/resources.hpp

diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp
new file mode 100644
index 0000000..5ab952c
--- /dev/null
+++ b/cpp/include/nvforest/detail/raft_proto/resources.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <nvforest/detail/raft_proto/cuda_stream.hpp>
+
+#include <raft/core/device_resources.hpp>
+
+namespace raft_proto {
+
+inline cuda_stream get_next_usable_stream(raft::device_resources const& resource)
+{
+#ifdef NVFOREST_ENABLE_GPU
+  return resource.get_next_usable_stream().value();
+#else
+  (void)resource;
+  return cuda_stream{};
+#endif
+}
+
+}  // namespace raft_proto
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 9f5a951..6692f75 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -8,10 +8,9 @@
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/cuda_check.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/raft_proto/resources.hpp>
 #include <nvforest/infer_kind.hpp>
 
-#include <raft/core/device_resources.hpp>
-
 #ifdef NVFOREST_ENABLE_GPU
 #include <cuda_runtime_api.h>
 #endif
@@ -198,7 +197,7 @@ struct forest_model {
           if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) {
             concrete_forest.predict(output,
                                     input,
-                                    get_next_raft_proto_stream(resource),
+                                    raft_proto::get_next_usable_stream(resource),
                                     predict_type,
                                     specified_chunk_size);
           } else {
@@ -212,7 +211,7 @@ struct forest_model {
                        specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION);
             auto partition_count = raft_proto::ceildiv(row_count, partition_size);
             for (auto i = std::size_t{}; i < partition_count; ++i) {
-              auto stream = get_next_raft_proto_stream(resource);
+              auto stream = raft_proto::get_next_usable_stream(resource);
               auto rows_in_this_partition =
                 std::min(partition_size, row_count - i * partition_size);
               auto partition_in = raft_proto::buffer<io_t>{};
@@ -366,17 +365,6 @@ struct forest_model {
   }
 
  private:
-  static raft_proto::cuda_stream get_next_raft_proto_stream(
-    raft::device_resources const& resource)
-  {
-#ifdef NVFOREST_ENABLE_GPU
-    return resource.get_next_usable_stream().value();
-#else
-    (void)resource;
-    return raft_proto::cuda_stream{};
-#endif
-  }
-
   decision_forest_variant decision_forest_;
   // Cache for auto-instantiated RAFT device resource
   std::unique_ptr<raft::device_resources> cached_device_resources_;

From afd5cab2d9e84277172cfcb2a597aea5fc7c1dac Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 13:55:14 +0000
Subject: [PATCH 24/35] Update C++ README for device resources API

---
 cpp/include/nvforest/README.md | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md
index 3e3df17..38eff4c 100644
--- a/cpp/include/nvforest/README.md
+++ b/cpp/include/nvforest/README.md
@@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
 
 // Assuming that input is a float* pointing to data already located on-device
 
-auto handle = raft_proto::handle_t{};
-
 nvforest_model.predict(
-  handle,
   output,
   input,
   num_rows,
@@ -119,11 +116,10 @@ nvforest_model.predict(
 );
 ```
 
-**handle**: To provide a unified interface on CPU and GPU, we introduce
-`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
-placeholder in CPU-only builds, and using it does not require any CUDA
-functionality. For GPU-enabled builds, you can construct a
-`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use.
+The overload shown above auto-instantiates and caches a `raft::device_resources`
+object. It synchronizes the resource before returning. Applications that need to
+control stream or stream-pool usage can instead pass an existing
+`raft::device_resources` object as the first argument.
 
 **output**: Pointer to pre-allocated buffer where results should be
 written. If the model has been loaded at single precision, this should be a

From 5af8cac4407b2a96dcd32481ecd62f2cf3c5ff96 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 13:55:19 +0000
Subject: [PATCH 25/35] Remove stale Treelite importer handle include

---
 cpp/tests/treelite_importer.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu
index 307e000..69af84d 100644
--- a/cpp/tests/treelite_importer.cu
+++ b/cpp/tests/treelite_importer.cu
@@ -4,7 +4,6 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/handle.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>

From a2111d29e619884583355d44577d72fa7542870b Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 14:23:50 +0000
Subject: [PATCH 26/35] Reject null prediction pointers

---
 cpp/include/nvforest/forest_model.hpp |  4 +++
 cpp/tests/treelite_importer.cu        | 38 +++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 6692f75..2d4ecff 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -295,6 +295,10 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
+    if (num_rows != 0 && (output == nullptr || input == nullptr)) {
+      throw runtime_error{"Input and output pointers must be non-null when num_rows > 0"};
+    }
+
     int current_device_id;
     if (out_mem_type == raft_proto::device_type::gpu ||
         in_mem_type == raft_proto::device_type::gpu) {
diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu
index 69af84d..ab18453 100644
--- a/cpp/tests/treelite_importer.cu
+++ b/cpp/tests/treelite_importer.cu
@@ -356,6 +356,44 @@ TEST(TreeliteImporter, DegenerateTree)
   ASSERT_EQ(preds, expected_preds);
 }
 
+TEST(TreeliteImporter, RejectNullPointers)
+{
+  auto tl_model       = make_degenerate_tree<false>(1.0);
+  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+
+  auto resource      = raft::device_resources{};
+  auto X             = std::vector<double>{0.0};
+  auto preds         = std::vector<double>(1, 0.0);
+  auto* null_pointer = static_cast<double*>(nullptr);
+
+  EXPECT_THROW(nvforest_model.predict(resource,
+                                      null_pointer,
+                                      X.data(),
+                                      1,
+                                      raft_proto::device_type::cpu,
+                                      raft_proto::device_type::cpu,
+                                      nvforest::infer_kind::default_kind,
+                                      1),
+               nvforest::runtime_error);
+  EXPECT_THROW(nvforest_model.predict(resource,
+                                      preds.data(),
+                                      null_pointer,
+                                      1,
+                                      raft_proto::device_type::cpu,
+                                      raft_proto::device_type::cpu,
+                                      nvforest::infer_kind::default_kind,
+                                      1),
+               nvforest::runtime_error);
+  EXPECT_THROW(nvforest_model.predict(null_pointer,
+                                      X.data(),
+                                      1,
+                                      raft_proto::device_type::cpu,
+                                      raft_proto::device_type::cpu,
+                                      nvforest::infer_kind::default_kind,
+                                      1),
+               nvforest::runtime_error);
+}
+
 TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
 {
   auto tl_model       = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});

From 6e2cc4275461c4623718d53cef6f65252d3cb3c0 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 14:23:58 +0000
Subject: [PATCH 27/35] Document device resources migration path

---
 docs/source/getting_started.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index e94b666..c3fce83 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -202,3 +202,17 @@ Now that the tree model is fully imported into nvForest, let's run inference:
     fm.predict(output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);
+
+The overload shown above auto-instantiates and caches a ``raft::device_resources``
+object and synchronizes before returning. Code that previously constructed a
+``raft_proto::handle_t`` should instead pass a ``raft::device_resources`` object
+directly when it needs to control CUDA stream or stream-pool usage:
+
+.. code-block:: cpp
+
+    #include <raft/core/device_resources.hpp>
+
+    raft::device_resources resource{};
+    fm.predict(resource, output, input, num_rows,
+               raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+               nvforest::infer_kind::default_kind);

From 902c2ea5f2bdba4313796944ed30a9a4824342b1 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 14:24:08 +0000
Subject: [PATCH 28/35] Clarify auto-resource test coverage

---
 cpp/tests/treelite_importer.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cu
index ab18453..71cd18e 100644
--- a/cpp/tests/treelite_importer.cu
+++ b/cpp/tests/treelite_importer.cu
@@ -409,8 +409,8 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto h_preds        = std::vector<double>(2);
   auto expected_preds = std::vector<double>{0.5, 0.5};
   for (int i = 0; i < 3; ++i) {
-    // Make sure that auto-instantiated RAFT resource gets cached properly
-    // For this interface, predict() will synchronize the stream automatically.
+    // Repeatedly exercise the auto-resource overload. This interface
+    // synchronizes the stream before returning.
     nvforest_model.predict(thrust::raw_pointer_cast(preds.data()),
                            thrust::raw_pointer_cast(X.data()),
                            1,

From 779f8c23187e8796f05b41f8ca59e7203014b0c9 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 14:31:36 +0000
Subject: [PATCH 29/35] Clarify device resources usage docs

---
 docs/source/getting_started.rst | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index c3fce83..46b1b27 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -204,9 +204,9 @@ Now that the tree model is fully imported into nvForest, let's run inference:
                nvforest::infer_kind::default_kind);
 
 The overload shown above auto-instantiates and caches a ``raft::device_resources``
-object and synchronizes before returning. Code that previously constructed a
-``raft_proto::handle_t`` should instead pass a ``raft::device_resources`` object
-directly when it needs to control CUDA stream or stream-pool usage:
+object and synchronizes before returning. This is the recommended path unless
+your application needs to control CUDA stream or stream-pool usage directly. For
+that advanced use case, pass an explicit ``raft::device_resources`` object:
 
 .. code-block:: cpp
 
@@ -216,3 +216,14 @@ directly when it needs to control CUDA stream or stream-pool usage:
     fm.predict(resource, output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);
+
+.. note::
+
+    In version 26.06, the C++ prediction API changed from ``raft_proto::handle_t``
+    to ``raft::device_resources``. Code that previously constructed a
+    ``raft_proto::handle_t`` should use the no-resource overload shown above, or
+    if the previous call site relied on explicit CUDA stream or stream-pool
+    control, replace calls of the form
+    ``fm.predict(handle, output, input, num_rows, ...)`` with
+    ``fm.predict(resource, output, input, num_rows, ...)`` using
+    ``raft::device_resources``.

From 397835f7a570c67d8e0e32ef8d7811fb9852705d Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 15:05:46 +0000
Subject: [PATCH 30/35] Document RAFT resource migration debt

---
 cpp/include/nvforest/detail/raft_proto/resources.hpp | 4 ++++
 cpp/include/nvforest/forest_model.hpp                | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp
index 5ab952c..091acc8 100644
--- a/cpp/include/nvforest/detail/raft_proto/resources.hpp
+++ b/cpp/include/nvforest/detail/raft_proto/resources.hpp
@@ -10,6 +10,10 @@
 
 namespace raft_proto {
 
+// TODO(nvforest#121): This is a compatibility bridge while nvForest accepts
+// raft::device_resources publicly but still uses raft_proto::cuda_stream
+// internally. Remove this once internal resource/stream handling is migrated to
+// RAFT/RMM abstractions consistently.
 inline cuda_stream get_next_usable_stream(raft::device_resources const& resource)
 {
 #ifdef NVFOREST_ENABLE_GPU
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 2d4ecff..1c7f2a5 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -134,6 +134,9 @@ struct forest_model {
    * optimal value a priori. If omitted, a heuristic will be used to select a
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
+  // TODO(nvforest#121): This lower-level overload still exposes
+  // raft_proto::cuda_stream. Revisit it as part of the internal migration to
+  // consistent RAFT/RMM resource and stream abstractions.
   template <typename io_t>
   void predict(raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,

From 5f344d0726239e116c5ba86d332a4ba039867358 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 15:09:20 +0000
Subject: [PATCH 31/35] Deprecate Python Handle alias

---
 python/nvforest/nvforest/__init__.py          | 17 +++++++++-
 python/nvforest/nvforest/_factory.py          | 34 +++++++++----------
 python/nvforest/nvforest/_forest_inference.py | 20 +++++------
 python/nvforest/nvforest/_handle.py           | 19 +++++++++--
 .../nvforest/detail/forest_inference.pyx      |  6 ++--
 5 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py
index 4fc9b15..4fc3ff2 100644
--- a/python/nvforest/nvforest/__init__.py
+++ b/python/nvforest/nvforest/__init__.py
@@ -24,12 +24,13 @@
     GPUForestInferenceClassifier,
     GPUForestInferenceRegressor,
 )
-from nvforest._handle import Handle
+from nvforest._handle import DeviceResources
 from nvforest._version import __git_commit__, __version__
 
 __all__ = [
     "CPUForestInferenceClassifier",
     "CPUForestInferenceRegressor",
+    "DeviceResources",
     "GPUForestInferenceClassifier",
     "GPUForestInferenceRegressor",
     "Handle",
@@ -39,3 +40,17 @@
     "__git_commit__",
     "__version__",
 ]
+
+
+def __getattr__(name):
+    if name == "Handle":
+        import warnings
+
+        warnings.warn(
+            "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 "
+            "and will be removed in 26.08.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        return DeviceResources
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py
index cc0700a..4943b91 100644
--- a/python/nvforest/nvforest/_factory.py
+++ b/python/nvforest/nvforest/_factory.py
@@ -17,7 +17,7 @@
     infer_device,
     infer_is_classifier,
 )
-from nvforest._handle import Handle
+from nvforest._handle import DeviceResources
 
 
 def get_forest_inference_class(device, is_classifier) -> type:
@@ -39,7 +39,7 @@ def make_forest_inference_object(
     treelite_model: treelite.Model,
     device: str,
     device_id: Optional[int],
-    handle: Optional[Handle],
+    handle: Optional[DeviceResources],
     layout: str,
     default_chunk_size: Optional[int],
     align_bytes: Optional[int],
@@ -72,7 +72,7 @@ def load_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    handle: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a model into nvForest from a serialized model file.
 
@@ -113,10 +113,10 @@ def load_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
-        pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+    handle : nvforest.DeviceResources or None
+        For GPU execution, the nvForest device resources containing the stream
+        or stream pool to use during loading and inference. If not given, a new
+        device resources object will be constructed.
     """
     model_path = pathlib.Path(model_file)
     if not model_path.exists():
@@ -174,7 +174,7 @@ def load_from_sklearn(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    handle: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a Scikit-Learn forest model to nvForest
 
@@ -208,10 +208,10 @@ def load_from_sklearn(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
-        pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+    handle : nvforest.DeviceResources or None
+        For GPU execution, the nvForest device resources containing the stream
+        or stream pool to use during loading and inference. If not given, a new
+        device resources object will be constructed.
     """
     tl_model = treelite.sklearn.import_model(skl_model)
 
@@ -236,7 +236,7 @@ def load_from_treelite_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[Handle] = None,
+    handle: Optional[DeviceResources] = None,
 ) -> ForestInference:
     """Load a Treelite forest model to nvForest
 
@@ -270,10 +270,10 @@ def load_from_treelite_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.Handle or None
-        For GPU execution, the nvForest handle containing the stream or stream
-        pool to use during loading and inference. If not given, a new
-        handle will be constructed.
+    handle : nvforest.DeviceResources or None
+        For GPU execution, the nvForest device resources containing the stream
+        or stream pool to use during loading and inference. If not given, a new
+        device resources object will be constructed.
     """
     return make_forest_inference_object(
         treelite_model=tl_model,
diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py
index 2c3c933..a7602ae 100644
--- a/python/nvforest/nvforest/_forest_inference.py
+++ b/python/nvforest/nvforest/_forest_inference.py
@@ -18,7 +18,7 @@
 from cuda.bindings import runtime
 
 from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor
-from nvforest._handle import Handle
+from nvforest._handle import DeviceResources
 from nvforest._typing import DataType
 from nvforest.detail.forest_inference import ForestInferenceImpl
 
@@ -123,7 +123,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        handle: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -310,7 +310,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        handle: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -334,7 +334,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        handle: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -425,7 +425,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        handle: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -449,7 +449,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        handle: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -532,7 +532,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        handle: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -557,7 +557,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        handle: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -649,7 +649,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[Handle] = None,
+        handle: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -674,7 +674,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[Handle],
+        handle: Optional[DeviceResources],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
index 0633eee..878c48a 100644
--- a/python/nvforest/nvforest/_handle.py
+++ b/python/nvforest/nvforest/_handle.py
@@ -3,7 +3,22 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import warnings
+
 from pylibraft.common.handle import DeviceResources as RaftDeviceResources
 
-# For now, nvforest.handle.Handle is an alias of pylibraft.common.handle.DeviceResources
-Handle = RaftDeviceResources
+DeviceResources = RaftDeviceResources
+
+__all__ = ["DeviceResources", "Handle"]
+
+
+def __getattr__(name):
+    if name == "Handle":
+        warnings.warn(
+            "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 "
+            "and will be removed in 26.08.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        return DeviceResources
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 946c3fa..3a513b6 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -8,7 +8,7 @@ from typing import Optional, Union
 import numpy as np
 import treelite
 
-from nvforest._handle import Handle
+from nvforest._handle import DeviceResources
 from nvforest._typing import DataType
 from nvforest.detail.treelite import safe_treelite_call
 
@@ -282,7 +282,7 @@ class ForestInferenceImpl:
         treelite_model: treelite.Model,
         device: str,
         device_id: int,
-        handle: Optional[Handle] = None,
+        handle: Optional[DeviceResources] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -290,7 +290,7 @@ class ForestInferenceImpl:
     ):
         # Assumption: The caller needs to pass in correct (device, device_id) pair
         # This function will not contain any logic for auto-detecting device.
-        self.handle = Handle() if handle is None else handle
+        self.handle = DeviceResources() if handle is None else handle
         self._layout = layout
         self.precision = precision
         self.default_chunk_size = default_chunk_size

From 121c40059abd74e0f4799a0b2eb3e42cf788797d Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 15:37:24 +0000
Subject: [PATCH 32/35] Build nvforest with C++20

---
 cpp/CMakeLists.txt       | 4 ++--
 cpp/tests/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fb9d015..712d0a3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -257,9 +257,9 @@ set_target_properties(
   PROPERTIES BUILD_RPATH "\$ORIGIN"
              INSTALL_RPATH "\$ORIGIN"
              # set target compile options
-             CXX_STANDARD 17
+             CXX_STANDARD 20
              CXX_STANDARD_REQUIRED ON
-             CUDA_STANDARD 17
+             CUDA_STANDARD 20
              CUDA_STANDARD_REQUIRED ON
              POSITION_INDEPENDENT_CODE ON
              INTERFACE_POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index db7d7f8..4f4b256 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -62,9 +62,9 @@ function(ConfigureTest)
   set_target_properties(
     ${_NVFOREST_TEST_NAME}
     PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
-               CXX_STANDARD 17
+               CXX_STANDARD 20
                CXX_STANDARD_REQUIRED ON
-               CUDA_STANDARD 17
+               CUDA_STANDARD 20
                CUDA_STANDARD_REQUIRED ON)
 
   set(_NVFOREST_TEST_COMPONENT_NAME testing)

From 10499f25661b8ea3440a2251e966e2957734fd15 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 15:57:57 +0000
Subject: [PATCH 33/35] Reduce PR 102 to auto-instantiation

---
 cpp/CMakeLists.txt                            |   4 +-
 cpp/include/nvforest/README.md                |  12 +-
 .../nvforest/detail/raft_proto/handle.hpp     |  43 +++++
 .../nvforest/detail/raft_proto/resources.hpp  |  27 ---
 cpp/include/nvforest/forest_model.hpp         |  85 +++------
 cpp/tests/CMakeLists.txt                      |   6 +-
 ...lite_importer.cu => treelite_importer.cpp} | 168 +++++++-----------
 docs/source/getting_started.rst               |  33 +---
 python/nvforest/nvforest/__init__.py          |  17 +-
 python/nvforest/nvforest/_factory.py          |  34 ++--
 python/nvforest/nvforest/_forest_inference.py |  20 +--
 python/nvforest/nvforest/_handle.py           |  21 +--
 .../nvforest/detail/forest_inference.pyx      |  43 ++---
 .../nvforest/detail/raft_proto/handle.pxd     |  19 ++
 14 files changed, 218 insertions(+), 314 deletions(-)
 create mode 100644 cpp/include/nvforest/detail/raft_proto/handle.hpp
 delete mode 100644 cpp/include/nvforest/detail/raft_proto/resources.hpp
 rename cpp/tests/{treelite_importer.cu => treelite_importer.cpp} (65%)
 create mode 100644 python/nvforest/nvforest/detail/raft_proto/handle.pxd

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 712d0a3..fb9d015 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -257,9 +257,9 @@ set_target_properties(
   PROPERTIES BUILD_RPATH "\$ORIGIN"
              INSTALL_RPATH "\$ORIGIN"
              # set target compile options
-             CXX_STANDARD 20
+             CXX_STANDARD 17
              CXX_STANDARD_REQUIRED ON
-             CUDA_STANDARD 20
+             CUDA_STANDARD 17
              CUDA_STANDARD_REQUIRED ON
              POSITION_INDEPENDENT_CODE ON
              INTERFACE_POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md
index 38eff4c..3e3df17 100644
--- a/cpp/include/nvforest/README.md
+++ b/cpp/include/nvforest/README.md
@@ -106,7 +106,10 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
 
 // Assuming that input is a float* pointing to data already located on-device
 
+auto handle = raft_proto::handle_t{};
+
 nvforest_model.predict(
+  handle,
   output,
   input,
   num_rows,
@@ -116,10 +119,11 @@ nvforest_model.predict(
 );
 ```
 
-The overload shown above auto-instantiates and caches a `raft::device_resources`
-object. It synchronizes the resource before returning. Applications that need to
-control stream or stream-pool usage can instead pass an existing
-`raft::device_resources` object as the first argument.
+**handle**: To provide a unified interface on CPU and GPU, we introduce
+`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
+placeholder in CPU-only builds, and using it does not require any CUDA
+functionality. For GPU-enabled builds, you can construct a
+`raft_proto::handle_t` directly from the `raft::handle_t` you wish to use.
 
 **output**: Pointer to pre-allocated buffer where results should be
 written. If the model has been loaded at single precision, this should be a
diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/detail/raft_proto/handle.hpp
new file mode 100644
index 0000000..086d61b
--- /dev/null
+++ b/cpp/include/nvforest/detail/raft_proto/handle.hpp
@@ -0,0 +1,43 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/raft_proto/cuda_stream.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#ifdef NVFOREST_ENABLE_GPU
+#include <raft/core/handle.hpp>
+#endif
+
+namespace raft_proto {
+#ifdef NVFOREST_ENABLE_GPU
+struct handle_t {
+  handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {}
+  handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {}
+  auto get_next_usable_stream() const
+  {
+    return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()};
+  }
+  auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); }
+  auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
+  void synchronize() const
+  {
+    raft_handle_->sync_stream_pool();
+    raft_handle_->sync_stream();
+  }
+
+ private:
+  // Non-owning pointer (raft::handle_t is not movable); the pointee must outlive this wrapper
+  raft::handle_t const* raft_handle_;
+};
+#else
+struct handle_t {
+  auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; }
+  auto get_stream_pool_size() const { return std::size_t{}; }
+  auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
+  void synchronize() const {}
+};
+#endif
+}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/resources.hpp b/cpp/include/nvforest/detail/raft_proto/resources.hpp
deleted file mode 100644
index 091acc8..0000000
--- a/cpp/include/nvforest/detail/raft_proto/resources.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-
-#include <raft/core/device_resources.hpp>
-
-namespace raft_proto {
-
-// TODO(nvforest#121): This is a compatibility bridge while nvForest accepts
-// raft::device_resources publicly but still uses raft_proto::cuda_stream
-// internally. Remove this once internal resource/stream handling is migrated to
-// RAFT/RMM abstractions consistently.
-inline cuda_stream get_next_usable_stream(raft::device_resources const& resource)
-{
-#ifdef NVFOREST_ENABLE_GPU
-  return resource.get_next_usable_stream().value();
-#else
-  (void)resource;
-  return cuda_stream{};
-#endif
-}
-
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 1c7f2a5..12c4738 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -8,7 +8,7 @@
 #include <nvforest/detail/raft_proto/buffer.hpp>
 #include <nvforest/detail/raft_proto/cuda_check.hpp>
 #include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/detail/raft_proto/resources.hpp>
+#include <nvforest/detail/raft_proto/handle.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #ifdef NVFOREST_ENABLE_GPU
@@ -16,7 +16,6 @@
 #endif
 
 #include <cstddef>
-#include <memory>
 #include <type_traits>
 #include <variant>
 
@@ -31,7 +30,7 @@ namespace nvforest {
 struct forest_model {
   /** Wrap a decision_forest in a full forest_model object */
   forest_model(decision_forest_variant&& forest = decision_forest_variant{})
-    : decision_forest_{forest}, cached_device_resources_{}
+    : decision_forest_{forest}
   {
   }
 
@@ -134,9 +133,6 @@ struct forest_model {
    * optimal value a priori. If omitted, a heuristic will be used to select a
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
-  // TODO(nvforest#121): This lower-level overload still exposes
-  // raft_proto::cuda_stream. Revisit it as part of the internal migration to
-  // consistent RAFT/RMM resource and stream abstractions.
   template <typename io_t>
   void predict(raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
@@ -161,8 +157,8 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource RAFT resource which will be used to provide
-   * streams for evaluation.
+   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
+   * on GPU) which will be used to provide streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
    * work will be distributed across available streams to copy the data back
@@ -186,35 +182,31 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft::device_resources const& resource,
+  void predict(raft_proto::handle_t const& handle,
                raft_proto::buffer<io_t>& output,
                raft_proto::buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
     std::visit(
-      [this, predict_type, &resource, &output, &input, &specified_chunk_size](
+      [this, predict_type, &handle, &output, &input, &specified_chunk_size](
         auto&& concrete_forest) {
         using model_io_t = typename std::remove_reference_t<decltype(concrete_forest)>::io_type;
         if constexpr (std::is_same_v<model_io_t, io_t>) {
           if (output.memory_type() == memory_type() && input.memory_type() == memory_type()) {
-            concrete_forest.predict(output,
-                                    input,
-                                    raft_proto::get_next_usable_stream(resource),
-                                    predict_type,
-                                    specified_chunk_size);
+            concrete_forest.predict(
+              output, input, handle.get_next_usable_stream(), predict_type, specified_chunk_size);
           } else {
             auto constexpr static const MIN_CHUNKS_PER_PARTITION = std::size_t{64};
             auto constexpr static const MAX_CHUNK_SIZE           = std::size_t{64};
 
-            auto row_count           = input.size() / num_features();
-            auto usable_stream_count = std::max(resource.get_stream_pool_size(), std::size_t{1});
+            auto row_count = input.size() / num_features();
             auto partition_size =
-              std::max(raft_proto::ceildiv(row_count, usable_stream_count),
+              std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()),
                        specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION);
             auto partition_count = raft_proto::ceildiv(row_count, partition_size);
             for (auto i = std::size_t{}; i < partition_count; ++i) {
-              auto stream = raft_proto::get_next_usable_stream(resource);
+              auto stream = handle.get_next_usable_stream();
               auto rows_in_this_partition =
                 std::min(partition_size, row_count - i * partition_size);
               auto partition_in = raft_proto::buffer<io_t>{};
@@ -265,8 +257,8 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] resource RAFT resource which will be used to provide
-   * streams for evaluation.
+   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
+   * on GPU) which will be used to provide streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
    * @param[in] input Pointer to the input data
@@ -289,7 +281,7 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft::device_resources const& resource,
+  void predict(raft_proto::handle_t const& handle,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
@@ -298,10 +290,6 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
-    if (num_rows != 0 && (output == nullptr || input == nullptr)) {
-      throw runtime_error{"Input and output pointers must be non-null when num_rows > 0"};
-    }
-
     int current_device_id;
     if (out_mem_type == raft_proto::device_type::gpu ||
         in_mem_type == raft_proto::device_type::gpu) {
@@ -317,34 +305,12 @@ struct forest_model {
       raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id};
     auto in_buffer =
       raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id};
-    predict(resource, out_buffer, in_buffer, predict_type, specified_chunk_size);
+    predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
   /**
-   * Perform inference on given input (with auto-instantiated RAFT resource).
-   * Note. This function is blocking and will synchronize the underlying RAFT
-   * resource at return time.
-   *
-   * @param[out] output Pointer to the memory location where output should end
-   * up
-   * @param[in] input Pointer to the input data
-   * @param[in] num_rows Number of rows in input
-   * @param[in] out_mem_type The memory type (device/host) of the output
-   * buffer
-   * @param[in] in_mem_type The memory type (device/host) of the input buffer
-   * @param[in] predict_type Type of inference to perform. Defaults to summing
-   * the outputs of all trees and produce an output per row. If set to
-   * "per_tree", we will instead output all outputs of individual trees.
-   * If set to "leaf_id", we will output the integer ID of the leaf node
-   * for each tree.
-   * @param[in] specified_chunk_size: Specifies the mini-batch size for
-   * processing. This has different meanings on CPU and GPU, but on GPU it
-   * corresponds to the number of rows evaluated per inference iteration
-   * on a single block. It can take on any power of 2 from 1 to 32, and
-   * runtime performance is quite sensitive to the value chosen. In general,
-   * larger batches benefit from higher values, but it is hard to predict the
-   * optimal value a priori. If omitted, a heuristic will be used to select a
-   * reasonable value. On CPU, this argument can generally just be omitted.
+   * Perform inference on given input using an internally managed RAFT handle.
+   * This function is blocking and synchronizes the handle before returning.
    */
   template <typename io_t>
   void predict(io_t* output,
@@ -355,11 +321,13 @@ struct forest_model {
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
-    // Auto-instantiate RAFT resource and cache it
-    if (!cached_device_resources_) {
-      cached_device_resources_ = std::make_unique<raft::device_resources>();
-    }
-    predict(*cached_device_resources_,
+#ifdef NVFOREST_ENABLE_GPU
+    auto raft_handle = raft::handle_t{};
+    auto handle      = raft_proto::handle_t{raft_handle};
+#else
+    auto handle = raft_proto::handle_t{};
+#endif
+    predict(handle,
             output,
             input,
             num_rows,
@@ -367,14 +335,11 @@ struct forest_model {
             in_mem_type,
             predict_type,
             specified_chunk_size);
-    cached_device_resources_->sync_stream_pool();
-    cached_device_resources_->sync_stream();
+    handle.synchronize();
   }
 
  private:
   decision_forest_variant decision_forest_;
-  // Cache for auto-instantiated RAFT device resource
-  std::unique_ptr<raft::device_resources> cached_device_resources_;
 };
 
 }  // namespace nvforest
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4f4b256..18bef95 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -62,9 +62,9 @@ function(ConfigureTest)
   set_target_properties(
     ${_NVFOREST_TEST_NAME}
     PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
-               CXX_STANDARD 20
+               CXX_STANDARD 17
                CXX_STANDARD_REQUIRED ON
-               CUDA_STANDARD 20
+               CUDA_STANDARD 17
                CUDA_STANDARD_REQUIRED ON)
 
   set(_NVFOREST_TEST_COMPONENT_NAME testing)
@@ -84,6 +84,6 @@ ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp)
 ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu)
 ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp)
 ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp)
-ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cu)
+ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp)
 
 rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libnvforest)
diff --git a/cpp/tests/treelite_importer.cu b/cpp/tests/treelite_importer.cpp
similarity index 65%
rename from cpp/tests/treelite_importer.cu
rename to cpp/tests/treelite_importer.cpp
index 71cd18e..0eee0db 100644
--- a/cpp/tests/treelite_importer.cu
+++ b/cpp/tests/treelite_importer.cpp
@@ -4,14 +4,11 @@
  */
 
 #include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/raft_proto/handle.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
 
-#include <raft/core/device_resources.hpp>
-
-#include <thrust/device_vector.h>
-
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include <treelite/enum/task_type.h>
@@ -251,45 +248,45 @@ auto static const SAMPLE_FOREST               = []() {
 
 TEST(TreeliteImporter, depth_first)
 {
-  auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first);
-  ASSERT_EQ(nvforest_model.num_features(), 7);
-  ASSERT_EQ(nvforest_model.num_outputs(), 1);
-  ASSERT_EQ(nvforest_model.num_trees(), 6);
-  ASSERT_FALSE(nvforest_model.has_vector_leaves());
-  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(nvforest_model.device_index(), -1);
-  ASSERT_FALSE(nvforest_model.is_double_precision());
+  auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::depth_first);
+  ASSERT_EQ(fil_model.num_features(), 7);
+  ASSERT_EQ(fil_model.num_outputs(), 1);
+  ASSERT_EQ(fil_model.num_trees(), 6);
+  ASSERT_FALSE(fil_model.has_vector_leaves());
+  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(fil_model.device_index(), -1);
+  ASSERT_FALSE(fil_model.is_double_precision());
 }
 
 TEST(TreeliteImporter, breadth_first)
 {
-  auto nvforest_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first);
-  ASSERT_EQ(nvforest_model.num_features(), 7);
-  ASSERT_EQ(nvforest_model.num_outputs(), 1);
-  ASSERT_EQ(nvforest_model.num_trees(), 6);
-  ASSERT_FALSE(nvforest_model.has_vector_leaves());
-  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(nvforest_model.device_index(), -1);
-  ASSERT_FALSE(nvforest_model.is_double_precision());
+  auto fil_model = import_from_treelite_model(*SAMPLE_FOREST, tree_layout::breadth_first);
+  ASSERT_EQ(fil_model.num_features(), 7);
+  ASSERT_EQ(fil_model.num_outputs(), 1);
+  ASSERT_EQ(fil_model.num_trees(), 6);
+  ASSERT_FALSE(fil_model.has_vector_leaves());
+  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(fil_model.device_index(), -1);
+  ASSERT_FALSE(fil_model.is_double_precision());
 }
 
 TEST(TreeliteImporter, layered_children_together)
 {
-  auto nvforest_model =
+  auto fil_model =
     import_from_treelite_model(*SAMPLE_FOREST, tree_layout::layered_children_together);
-  ASSERT_EQ(nvforest_model.num_features(), 7);
-  ASSERT_EQ(nvforest_model.num_outputs(), 1);
-  ASSERT_EQ(nvforest_model.num_trees(), 6);
-  ASSERT_FALSE(nvforest_model.has_vector_leaves());
-  ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
-  ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
-  ASSERT_EQ(nvforest_model.device_index(), -1);
-  ASSERT_FALSE(nvforest_model.is_double_precision());
+  ASSERT_EQ(fil_model.num_features(), 7);
+  ASSERT_EQ(fil_model.num_outputs(), 1);
+  ASSERT_EQ(fil_model.num_trees(), 6);
+  ASSERT_FALSE(fil_model.has_vector_leaves());
+  ASSERT_EQ(fil_model.row_postprocessing(), row_op::disable);
+  ASSERT_EQ(fil_model.elem_postprocessing(), element_op::disable);
+  ASSERT_EQ(fil_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(fil_model.device_index(), -1);
+  ASSERT_FALSE(fil_model.is_double_precision());
 }
 
 template <bool use_leaf_vector, typename leaf_t>
@@ -337,90 +334,47 @@ auto make_degenerate_tree(const leaf_t& leaf)
 
 TEST(TreeliteImporter, DegenerateTree)
 {
-  auto tl_model       = make_degenerate_tree<false>(1.0);
-  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
-  ASSERT_FALSE(nvforest_model.has_vector_leaves());
+  auto tl_model  = make_degenerate_tree<false>(1.0);
+  auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+  ASSERT_FALSE(fil_model.has_vector_leaves());
 
-  auto resource       = raft::device_resources{};
+#ifdef NVFOREST_ENABLE_GPU
+  auto raft_handle = raft::handle_t{};
+  auto handle      = raft_proto::handle_t{raft_handle};
+#else
+  auto handle = raft_proto::handle_t{};
+#endif
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
   auto expected_preds = std::vector<double>{1.0};
-  nvforest_model.predict(resource,
-                         preds.data(),
-                         X.data(),
-                         1,
-                         raft_proto::device_type::cpu,
-                         raft_proto::device_type::cpu,
-                         nvforest::infer_kind::default_kind,
-                         1);
+  fil_model.predict(handle,
+                    preds.data(),
+                    X.data(),
+                    1,
+                    raft_proto::device_type::cpu,
+                    raft_proto::device_type::cpu,
+                    nvforest::infer_kind::default_kind,
+                    1);
   ASSERT_EQ(preds, expected_preds);
 }
 
-TEST(TreeliteImporter, RejectNullPointers)
-{
-  auto tl_model       = make_degenerate_tree<false>(1.0);
-  auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
-
-  auto resource      = raft::device_resources{};
-  auto X             = std::vector<double>{0.0};
-  auto preds         = std::vector<double>(1, 0.0);
-  auto* null_pointer = static_cast<double*>(nullptr);
-
-  EXPECT_THROW(nvforest_model.predict(resource,
-                                      null_pointer,
-                                      X.data(),
-                                      1,
-                                      raft_proto::device_type::cpu,
-                                      raft_proto::device_type::cpu,
-                                      nvforest::infer_kind::default_kind,
-                                      1),
-               nvforest::runtime_error);
-  EXPECT_THROW(nvforest_model.predict(resource,
-                                      preds.data(),
-                                      null_pointer,
-                                      1,
-                                      raft_proto::device_type::cpu,
-                                      raft_proto::device_type::cpu,
-                                      nvforest::infer_kind::default_kind,
-                                      1),
-               nvforest::runtime_error);
-  EXPECT_THROW(nvforest_model.predict(null_pointer,
-                                      X.data(),
-                                      1,
-                                      raft_proto::device_type::cpu,
-                                      raft_proto::device_type::cpu,
-                                      nvforest::infer_kind::default_kind,
-                                      1),
-               nvforest::runtime_error);
-}
-
 TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
 {
-  auto tl_model       = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});
-  auto nvforest_model = import_from_treelite_model(*tl_model,
-                                                   tree_layout::breadth_first,
-                                                   index_type{},
-                                                   std::nullopt,
-                                                   raft_proto::device_type::gpu);
-  ASSERT_TRUE(nvforest_model.has_vector_leaves());
+  auto tl_model  = make_degenerate_tree<true>(std::vector<double>{0.5, 0.5});
+  auto fil_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
+  ASSERT_TRUE(fil_model.has_vector_leaves());
 
-  auto X              = thrust::device_vector<double>{0.0};
-  auto preds          = thrust::device_vector<double>(2, 0.0);
-  auto h_preds        = std::vector<double>(2);
+  auto X              = std::vector<double>{0.0};
+  auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};
-  for (int i = 0; i < 3; ++i) {
-    // Repeatedly exercise the auto-resource overload. This interface
-    // synchronizes the stream before returning.
-    nvforest_model.predict(thrust::raw_pointer_cast(preds.data()),
-                           thrust::raw_pointer_cast(X.data()),
-                           1,
-                           raft_proto::device_type::gpu,
-                           raft_proto::device_type::gpu,
-                           nvforest::infer_kind::default_kind,
-                           1);
-    thrust::copy(preds.begin(), preds.end(), h_preds.begin());
-    ASSERT_EQ(h_preds, expected_preds);
-  }
+  fil_model.predict(preds.data(),
+                    X.data(),
+                    1,
+                    raft_proto::device_type::cpu,
+                    raft_proto::device_type::cpu,
+                    nvforest::infer_kind::default_kind,
+                    1);
+  ASSERT_EQ(preds, expected_preds);
 }
 
 }  // namespace nvforest
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 46b1b27..8500861 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -195,35 +195,16 @@ Now that the tree model is fully imported into nvForest, let's run inference:
 
 .. code-block:: cpp
 
+    #include <raft/core/handle.hpp>
+    #include <nvforest/detail/raft_proto/handle.hpp>
+
+    raft::handle_t raft_handle{};
+    raft_proto::handle_t handle{raft_handle};
+
     // Assumption:
     // * Both output and input are in the GPU memory.
     // * The input buffer should be of dimension (num_rows, num_features)
     // * The output buffer should be of dimension (num_rows, fm.num_outputs())
-    fm.predict(output, input, num_rows,
+    fm.predict(handle, output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);
-
-The overload shown above auto-instantiates and caches a ``raft::device_resources``
-object and synchronizes before returning. This is the recommended path unless
-your application needs to control CUDA stream or stream-pool usage directly. For
-that advanced use case, pass an explicit ``raft::device_resources`` object:
-
-.. code-block:: cpp
-
-    #include <raft/core/device_resources.hpp>
-
-    raft::device_resources resource{};
-    fm.predict(resource, output, input, num_rows,
-               raft_proto::device_type::gpu, raft_proto::device_type::gpu,
-               nvforest::infer_kind::default_kind);
-
-.. note::
-
-    In version 26.06, the C++ prediction API changed from ``raft_proto::handle_t``
-    to ``raft::device_resources``. Code that previously constructed a
-    ``raft_proto::handle_t`` should use the no-resource overload shown above, or
-    if the previous call site relied on explicit CUDA stream or stream-pool
-    control, replace calls of the form
-    ``fm.predict(handle, output, input, num_rows, ...)`` with
-    ``fm.predict(resource, output, input, num_rows, ...)`` using
-    ``raft::device_resources``.
diff --git a/python/nvforest/nvforest/__init__.py b/python/nvforest/nvforest/__init__.py
index 4fc3ff2..4fc9b15 100644
--- a/python/nvforest/nvforest/__init__.py
+++ b/python/nvforest/nvforest/__init__.py
@@ -24,13 +24,12 @@
     GPUForestInferenceClassifier,
     GPUForestInferenceRegressor,
 )
-from nvforest._handle import DeviceResources
+from nvforest._handle import Handle
 from nvforest._version import __git_commit__, __version__
 
 __all__ = [
     "CPUForestInferenceClassifier",
     "CPUForestInferenceRegressor",
-    "DeviceResources",
     "GPUForestInferenceClassifier",
     "GPUForestInferenceRegressor",
     "Handle",
@@ -40,17 +39,3 @@
     "__git_commit__",
     "__version__",
 ]
-
-
-def __getattr__(name):
-    if name == "Handle":
-        import warnings
-
-        warnings.warn(
-            "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 "
-            "and will be removed in 26.08.",
-            FutureWarning,
-            stacklevel=2,
-        )
-        return DeviceResources
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/python/nvforest/nvforest/_factory.py b/python/nvforest/nvforest/_factory.py
index 4943b91..cc0700a 100644
--- a/python/nvforest/nvforest/_factory.py
+++ b/python/nvforest/nvforest/_factory.py
@@ -17,7 +17,7 @@
     infer_device,
     infer_is_classifier,
 )
-from nvforest._handle import DeviceResources
+from nvforest._handle import Handle
 
 
 def get_forest_inference_class(device, is_classifier) -> type:
@@ -39,7 +39,7 @@ def make_forest_inference_object(
     treelite_model: treelite.Model,
     device: str,
     device_id: Optional[int],
-    handle: Optional[DeviceResources],
+    handle: Optional[Handle],
     layout: str,
     default_chunk_size: Optional[int],
     align_bytes: Optional[int],
@@ -72,7 +72,7 @@ def load_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a model into nvForest from a serialized model file.
 
@@ -113,10 +113,10 @@ def load_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.DeviceResources or None
-        For GPU execution, the nvForest device resources containing the stream
-        or stream pool to use during loading and inference. If not given, a new
-        device resources object will be constructed.
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
+        pool to use during loading and inference. If not given, a new
+        handle will be constructed.
     """
     model_path = pathlib.Path(model_file)
     if not model_path.exists():
@@ -174,7 +174,7 @@ def load_from_sklearn(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a Scikit-Learn forest model to nvForest
 
@@ -208,10 +208,10 @@ def load_from_sklearn(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.DeviceResources or None
-        For GPU execution, the nvForest device resources containing the stream
-        or stream pool to use during loading and inference. If not given, a new
-        device resources object will be constructed.
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
+        pool to use during loading and inference. If not given, a new
+        handle will be constructed.
     """
     tl_model = treelite.sklearn.import_model(skl_model)
 
@@ -236,7 +236,7 @@ def load_from_treelite_model(
     align_bytes: Optional[int] = None,
     precision: Optional[str] = None,
     device_id: Optional[int] = None,
-    handle: Optional[DeviceResources] = None,
+    handle: Optional[Handle] = None,
 ) -> ForestInference:
     """Load a Treelite forest model to nvForest
 
@@ -270,10 +270,10 @@ def load_from_treelite_model(
     device_id : int or None, default=None
         For GPU execution, the device on which to load and execute this
         model. For CPU execution, this value is currently ignored.
-    handle : nvforest.DeviceResources or None
-        For GPU execution, the nvForest device resources containing the stream
-        or stream pool to use during loading and inference. If not given, a new
-        device resources object will be constructed.
+    handle : nvforest.Handle or None
+        For GPU execution, the nvForest handle containing the stream or stream
+        pool to use during loading and inference. If not given, a new
+        handle will be constructed.
     """
     return make_forest_inference_object(
         treelite_model=tl_model,
diff --git a/python/nvforest/nvforest/_forest_inference.py b/python/nvforest/nvforest/_forest_inference.py
index a7602ae..2c3c933 100644
--- a/python/nvforest/nvforest/_forest_inference.py
+++ b/python/nvforest/nvforest/_forest_inference.py
@@ -18,7 +18,7 @@
 from cuda.bindings import runtime
 
 from nvforest._base import ForestInferenceClassifier, ForestInferenceRegressor
-from nvforest._handle import DeviceResources
+from nvforest._handle import Handle
 from nvforest._typing import DataType
 from nvforest.detail.forest_inference import ForestInferenceImpl
 
@@ -123,7 +123,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -310,7 +310,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -334,7 +334,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -425,7 +425,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -449,7 +449,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -532,7 +532,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -557,7 +557,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
@@ -649,7 +649,7 @@ def __init__(
         self,
         *,
         treelite_model: treelite.Model,
-        handle: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -674,7 +674,7 @@ def _create_with_layout(
         cls,
         *,
         treelite_model_bytes: bytes,
-        handle: Optional[DeviceResources],
+        handle: Optional[Handle],
         layout: str,
         default_chunk_size: Optional[int],
         align_bytes: Optional[int],
diff --git a/python/nvforest/nvforest/_handle.py b/python/nvforest/nvforest/_handle.py
index 878c48a..347b748 100644
--- a/python/nvforest/nvforest/_handle.py
+++ b/python/nvforest/nvforest/_handle.py
@@ -3,22 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-import warnings
+from pylibraft.common.handle import Handle as RaftHandle
 
-from pylibraft.common.handle import DeviceResources as RaftDeviceResources
-
-DeviceResources = RaftDeviceResources
-
-__all__ = ["DeviceResources", "Handle"]
-
-
-def __getattr__(name):
-    if name == "Handle":
-        warnings.warn(
-            "nvforest.Handle was renamed to nvforest.DeviceResources in 26.06 "
-            "and will be removed in 26.08.",
-            FutureWarning,
-            stacklevel=2,
-        )
-        return DeviceResources
-    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+# For now, nvforest.Handle is an alias of pylibraft.common.handle.Handle
+Handle = RaftHandle
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 3a513b6..0340c59 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -8,14 +8,13 @@ from typing import Optional, Union
 import numpy as np
 import treelite
 
-from nvforest._handle import DeviceResources
+from nvforest._handle import Handle
 from nvforest._typing import DataType
 from nvforest.detail.treelite import safe_treelite_call
 
-from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
-from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+from pylibraft.common.handle cimport handle_t as raft_handle_t
 
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
@@ -25,6 +24,7 @@ from nvforest.detail.raft_proto.cuda_stream cimport (
 from nvforest.detail.raft_proto.device_type cimport (
     device_type as raft_proto_device_t,
 )
+from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t
 from nvforest.detail.raft_proto.optional cimport nullopt, optional
 from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout
 from nvforest.detail.treelite cimport (
@@ -34,18 +34,10 @@ from nvforest.detail.treelite cimport (
 )
 
 
-cdef extern from "raft/core/device_resources.hpp" namespace "raft" nogil:
-    cdef cppclass device_resources:
-        device_resources() except +
-        cuda_stream_view get_next_usable_stream() except +
-        void sync_stream() except +
-        void sync_stream_pool() except +
-
-
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
-            const device_resources&,
+            const raft_proto_handle_t&,
             io_t*,
             io_t*,
             size_t,
@@ -77,13 +69,13 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
 
 cdef class ForestInference_impl():
     cdef forest_model model
-    cdef object py_handle
-    cdef device_resources* c_handle
+    cdef raft_proto_handle_t raft_proto_handle
+    cdef object raft_handle
     cdef object device
 
     def __cinit__(
         self,
-        handle: object,
+        raft_handle: object,
         tl_model_bytes: Union[bytes, bytearray],
         *,
         layout: str = "depth_first",
@@ -92,8 +84,12 @@ cdef class ForestInference_impl():
         device: str = "cpu",
         device_id: Optional[int] = None,
     ):
-        self.py_handle = handle
-        self.c_handle = <device_resources*><size_t>self.py_handle.getHandle()
+        # Store reference to RAFT handle to control lifetime, since raft_proto
+        # handle keeps a pointer to it
+        self.raft_handle = raft_handle
+        self.raft_proto_handle = raft_proto_handle_t(
+            <raft_handle_t*><size_t>self.raft_handle.getHandle()
+        )
 
         cdef optional[bool] use_double_precision_c
         cdef bool use_double_precision_bool
@@ -138,7 +134,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            <raft_proto_stream_t> self.c_handle.get_next_usable_stream().value()
+            self.raft_proto_handle.get_next_usable_stream()
         )
 
         safe_treelite_call(
@@ -248,7 +244,7 @@ cdef class ForestInference_impl():
 
         if model_dtype == np.float32:
             self.model.predict[float](
-                deref(self.c_handle),
+                self.raft_proto_handle,
                 <float *> out_ptr,
                 <float *> in_ptr,
                 n_rows,
@@ -259,7 +255,7 @@ cdef class ForestInference_impl():
             )
         else:
             self.model.predict[double](
-                deref(self.c_handle),
+                self.raft_proto_handle,
                 <double *> out_ptr,
                 <double *> in_ptr,
                 n_rows,
@@ -270,8 +266,7 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.c_handle.sync_stream_pool()
-            self.c_handle.sync_stream()
+            self.raft_proto_handle.synchronize()
         return preds
 
 
@@ -282,7 +277,7 @@ class ForestInferenceImpl:
         treelite_model: treelite.Model,
         device: str,
         device_id: int,
-        handle: Optional[DeviceResources] = None,
+        handle: Optional[Handle] = None,
         layout: str = "depth_first",
         default_chunk_size: Optional[int] = None,
         align_bytes: Optional[int] = None,
@@ -290,7 +285,7 @@ class ForestInferenceImpl:
     ):
         # Assumption: The caller needs to pass in correct (device, device_id) pair
         # This function will not contain any logic for auto-detecting device.
-        self.handle = DeviceResources() if handle is None else handle
+        self.handle = Handle() if handle is None else handle
         self._layout = layout
         self.precision = precision
         self.default_chunk_size = default_chunk_size
diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/raft_proto/handle.pxd
new file mode 100644
index 0000000..5ad107e
--- /dev/null
+++ b/python/nvforest/nvforest/detail/raft_proto/handle.pxd
@@ -0,0 +1,19 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+from pylibraft.common.handle cimport handle_t as raft_handle_t
+
+from nvforest.detail.raft_proto.cuda_stream cimport (
+    cuda_stream as raft_proto_stream_t,
+)
+
+
+cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil:
+    cdef cppclass handle_t:
+        handle_t() except +
+        handle_t(const raft_handle_t* handle_ptr) except +
+        handle_t(const raft_handle_t& handle) except +
+        raft_proto_stream_t get_next_usable_stream() except +
+        void synchronize() except+

From 0b2ac1a8793b37693518a5cd921262541a54e700 Mon Sep 17 00:00:00 2001
From: Simon Adorf <sadorf@nvidia.com>
Date: Tue, 19 May 2026 16:15:29 +0000
Subject: [PATCH 34/35] Document auto-instantiated inference path

---
 cpp/include/nvforest/README.md  | 33 +++++++++++++++++++++++++--------
 docs/source/getting_started.rst | 17 +++++++++++++----
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md
index 3e3df17..a1f83e2 100644
--- a/cpp/include/nvforest/README.md
+++ b/cpp/include/nvforest/README.md
@@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
 
 // Assuming that input is a float* pointing to data already located on-device
 
-auto handle = raft_proto::handle_t{};
-
 nvforest_model.predict(
-  handle,
   output,
   input,
   num_rows,
@@ -119,11 +116,31 @@ nvforest_model.predict(
 );
 ```
 
-**handle**: To provide a unified interface on CPU and GPU, we introduce
-`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
-placeholder in CPU-only builds, and using it does not require any CUDA
-functionality. For GPU-enabled builds, you can construct a
-`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use.
+This is the primary C++ inference path. nvForest creates the RAFT handle it
+needs internally and synchronizes before returning.
+
+Applications that already manage RAFT handles can pass one explicitly:
+
+```cpp
+auto raft_handle = raft::handle_t{};
+auto handle = raft_proto::handle_t{raft_handle};
+
+nvforest_model.predict(
+  handle,
+  output,
+  input,
+  num_rows,
+  raft_proto::device_type::gpu,
+  raft_proto::device_type::gpu,
+  4
+);
+```
+
+**handle**: The explicit-handle overload accepts `raft_proto::handle_t`, a
+wrapper for `raft::handle_t`. In CPU-only builds, the wrapper is currently
+just a placeholder, and using it does not require any CUDA functionality. For
+GPU-enabled builds, construct a `raft_proto::handle_t` directly from the
+`raft::handle_t` you wish to use.
 
 **output**: Pointer to pre-allocated buffer where results should be
 written. If the model has been loaded at single precision, this should be a
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 8500861..bbe2710 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -193,6 +193,19 @@ Once the tree model is available as a Treelite object, pass it to the
 
 Now that the tree model is fully imported into nvForest, let's run inference:
 
+.. code-block:: cpp
+
+    // Assumption:
+    // * Both output and input are in the GPU memory.
+    // * The input buffer should be of dimension (num_rows, num_features)
+    // * The output buffer should be of dimension (num_rows, fm.num_outputs())
+    fm.predict(output, input, num_rows,
+               raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+               nvforest::infer_kind::default_kind);
+
+Applications that want more control over handle ownership, stream reuse, or
+synchronization can pass a RAFT handle explicitly like this:
+
 .. code-block:: cpp
 
     #include <raft/core/handle.hpp>
@@ -201,10 +214,6 @@ Now that the tree model is fully imported into nvForest, let's run inference:
     raft::handle_t raft_handle{};
     raft_proto::handle_t handle{raft_handle};
 
-    // Assumption:
-    // * Both output and input are in the GPU memory.
-    // * The input buffer should be of dimension (num_rows, num_features)
-    // * The output buffer should be of dimension (num_rows, fm.num_outputs())
     fm.predict(handle, output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);

From b2f1d32e0311f1c85e50a33bf22f4451132a2516 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho <phcho@nvidia.com>
Date: Wed, 20 May 2026 19:58:46 -0700
Subject: [PATCH 35/35] Add a full docstring for the new predict()

---
 cpp/include/nvforest/forest_model.hpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 12c4738..e4256c1 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -311,6 +311,27 @@ struct forest_model {
   /**
    * Perform inference on given input using an internally managed RAFT handle.
    * This function is blocking and synchronizes the handle before returning.
+   *
+   * @param[out] output Pointer to the memory location where output should end
+   * up
+   * @param[in] input Pointer to the input data
+   * @param[in] num_rows Number of rows in input
+   * @param[in] out_mem_type The memory type (device/host) of the output
+   * buffer
+   * @param[in] in_mem_type The memory type (device/host) of the input buffer
+   * @param[in] predict_type Type of inference to perform. Defaults to summing
+   * the outputs of all trees and producing an output per row. If set to
+   * "per_tree", we will instead return the outputs of the individual trees.
+   * If set to "leaf_id", we will output the integer ID of the leaf node
+   * for each tree.
+   * @param[in] specified_chunk_size Specifies the mini-batch size for
+   * processing. This has different meanings on CPU and GPU, but on GPU it
+   * corresponds to the number of rows evaluated per inference iteration
+   * on a single block. It can take on any power of 2 from 1 to 32, and
+   * runtime performance is quite sensitive to the value chosen. In general,
+   * larger batches benefit from higher values, but it is hard to predict the
+   * optimal value a priori. If omitted, a heuristic will be used to select a
+   * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
   void predict(io_t* output,