rapidsai · hcho3 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
@@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
 
 // Assuming that input is a float* pointing to data already located on-device
 
-auto handle = raft_proto::handle_t{};
-
 nvforest_model.predict(
-  handle,
   output,
   input,
   num_rows,
@@ -119,11 +116,31 @@ nvforest_model.predict(
 );
 ```
 
-**handle**: To provide a unified interface on CPU and GPU, we introduce
-`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
-placeholder in CPU-only builds, and using it does not require any CUDA
-functionality. For GPU-enabled builds, you can construct a
-`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use.
+This is the primary C++ inference path. nvForest creates the RAFT handle it
+needs internally and synchronizes before returning.
+
+Applications that already manage RAFT handles can pass one explicitly:
+
+```cpp
+auto raft_handle = raft::handle_t{};
+auto handle = raft_proto::handle_t{raft_handle};
+
+nvforest_model.predict(
+  handle,
+  output,
+  input,
+  num_rows,
+  raft_proto::device_type::gpu,
+  raft_proto::device_type::gpu,
+  4
+);
+```
+
+**handle**: The explicit-handle overload accepts `raft_proto::handle_t`, a
+wrapper for `raft::handle_t`. In CPU-only builds this wrapper is currently
+just a placeholder, and using it does not require any CUDA functionality.
+For GPU-enabled builds, construct a `raft_proto::handle_t` directly from the
+`raft::handle_t` you wish to use.
 
 **output**: Pointer to pre-allocated buffer where results should be
 written. If the model has been loaded at single precision, this should be a

@@ -308,6 +308,57 @@ struct forest_model {
     predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
+  /**
+   * Perform inference on the given input using an internally managed RAFT
+   * handle. A handle is constructed on each call, and this function blocks
+   * by synchronizing that handle before returning.
+   *
+   * @param[out] output Pointer to the memory location where output is written
+   * @param[in] input Pointer to the input data
+   * @param[in] num_rows Number of rows in input
+   * @param[in] out_mem_type The memory type (device/host) of the output
+   * buffer
+   * @param[in] in_mem_type The memory type (device/host) of the input buffer
+   * @param[in] predict_type Type of inference to perform. Defaults to summing
+   * the outputs of all trees and producing an output per row. If set to
+   * "per_tree", we will instead output all outputs of individual trees.
+   * If set to "leaf_id", we will output the integer ID of the leaf node
+   * for each tree.
+   * @param[in] specified_chunk_size Specifies the mini-batch size for
+   * processing. This has different meanings on CPU and GPU, but on GPU it
+   * corresponds to the number of rows evaluated per inference iteration
+   * on a single block. It can take on any power of 2 from 1 to 32, and
+   * runtime performance is quite sensitive to the value chosen. In general,
+   * larger batches benefit from higher values, but it is hard to predict the
+   * optimal value a priori. If omitted, a heuristic will be used to select a
+   * reasonable value. On CPU, this argument can generally just be omitted.
+   */
+  template <typename io_t>
+  void predict(io_t* output,
+               io_t* input,
+               std::size_t num_rows,
+               raft_proto::device_type out_mem_type,
+               raft_proto::device_type in_mem_type,
+               infer_kind predict_type                        = infer_kind::default_kind,
+               std::optional<index_type> specified_chunk_size = std::nullopt)
+  {
+#ifdef NVFOREST_ENABLE_GPU
+    auto raft_handle = raft::handle_t{};
+    auto handle      = raft_proto::handle_t{raft_handle};
+#else
+    auto handle = raft_proto::handle_t{};
+#endif
+    predict(handle,
+            output,
+            input,
+            num_rows,
+            out_mem_type,
+            in_mem_type,
+            predict_type,
+            specified_chunk_size);
+    handle.synchronize();
+  }
+
  private:
   decision_forest_variant decision_forest_;
 };

@@ -408,17 +408,10 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_TRUE(nvforest_model.has_vector_leaves());
 
-#ifdef NVFOREST_ENABLE_GPU
-  auto raft_handle = raft::handle_t{};
-  auto handle      = raft_proto::handle_t{raft_handle};
-#else
-  auto handle = raft_proto::handle_t{};
-#endif
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
   auto expected_preds = std::vector<double>{0.5, 0.5};
-  nvforest_model.predict(handle,
-                         preds.data(),
+  nvforest_model.predict(preds.data(),
                          X.data(),
                          1,
                          raft_proto::device_type::cpu,

@@ -193,6 +193,19 @@ Once the tree model is available as a Treelite object, pass it to the
 
 Now that the tree model is fully imported into nvForest, let's run inference:
 
+.. code-block:: cpp
+
+    // Assumptions:
+    // * Both output and input are in GPU memory.
+    // * The input buffer should be of dimension (num_rows, num_features)
+    // * The output buffer should be of dimension (num_rows, fm.num_outputs())
+    fm.predict(output, input, num_rows,
+               raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+               nvforest::infer_kind::default_kind);
+
+Applications that want more control over handle ownership, stream reuse, or
+synchronization can pass a RAFT handle explicitly like this:
+
 .. code-block:: cpp
 
     #include <raft/core/handle.hpp>
@@ -201,10 +214,6 @@ Now that the tree model is fully imported into nvForest, let's run inference:
     raft::handle_t raft_handle{};
     raft_proto::handle_t handle{raft_handle};
 
-    // Assumption:
-    // * Both output and input are in the GPU memory.
-    // * The input buffer should be of dimension (num_rows, num_features)
-    // * The output buffer should be of dimension (num_rows, fm.num_outputs())
     fm.predict(handle, output, input, num_rows,
                raft_proto::device_type::gpu, raft_proto::device_type::gpu,
                nvforest::infer_kind::default_kind);