diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md index 3e3df17..a1f83e2 100644 --- a/cpp/include/nvforest/README.md +++ b/cpp/include/nvforest/README.md @@ -106,10 +106,7 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float)); // Assuming that input is a float* pointing to data already located on-device -auto handle = raft_proto::handle_t{}; - nvforest_model.predict( - handle, output, input, num_rows, @@ -119,11 +116,32 @@ nvforest_model.predict( ); ``` -**handle**: To provide a unified interface on CPU and GPU, we introduce -`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a -placeholder in CPU-only builds, and using it does not require any CUDA -functionality. For GPU-enabled builds, you can construct a -`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use. +This is the primary C++ inference path. nvForest creates the RAFT handle it +needs internally and synchronizes before returning. + +Applications that already manage RAFT handles can pass one explicitly: + +```cpp +auto raft_handle = raft::handle_t{}; +auto handle = raft_proto::handle_t{raft_handle}; + +nvforest_model.predict( + handle, + output, + input, + num_rows, + raft_proto::device_type::gpu, + raft_proto::device_type::gpu, + nvforest::infer_kind::default_kind, + 4 +); +``` + +**handle**: The explicit-handle overload accepts `raft_proto::handle_t`, a +wrapper for `raft::handle_t`. This is currently just a placeholder in CPU-only +builds, and using it does not require any CUDA functionality. For GPU-enabled +builds, construct a `raft_proto::handle_t` directly from the `raft::handle_t` +you wish to use. **output**: Pointer to pre-allocated buffer where results should be written. 
If the model has been loaded at single precision, this should be a
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 9c07f13..e4256c1 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -308,6 +308,57 @@ struct forest_model {
     predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
+  /**
+   * Perform inference on given input using an internally managed RAFT handle.
+   * This function is blocking and synchronizes the handle before returning.
+   *
+   * @param[out] output Pointer to the memory location where output should end
+   * up
+   * @param[in] input Pointer to the input data
+   * @param[in] num_rows Number of rows in input
+   * @param[in] out_mem_type The memory type (device/host) of the output
+   * buffer
+   * @param[in] in_mem_type The memory type (device/host) of the input buffer
+   * @param[in] predict_type Type of inference to perform. Defaults to summing
+   * the outputs of all trees and producing an output per row. If set to
+   * "per_tree", we will instead output all outputs of individual trees.
+   * If set to "leaf_id", we will output the integer ID of the leaf node
+   * for each tree.
+   * @param[in] specified_chunk_size Specifies the mini-batch size for
+   * processing. This has different meanings on CPU and GPU, but on GPU it
+   * corresponds to the number of rows evaluated per inference iteration
+   * on a single block. It can take on any power of 2 from 1 to 32, and
+   * runtime performance is quite sensitive to the value chosen. In general,
+   * larger batches benefit from higher values, but it is hard to predict the
+   * optimal value a priori. If omitted, a heuristic will be used to select a
+   * reasonable value. On CPU, this argument can generally just be omitted.
+   */
+  template <typename io_t>
+  void predict(io_t* output,
+               io_t* input,
+               std::size_t num_rows,
+               raft_proto::device_type out_mem_type,
+               raft_proto::device_type in_mem_type,
+               infer_kind predict_type                        = infer_kind::default_kind,
+               std::optional<index_type> specified_chunk_size = std::nullopt)
+  {
+#ifdef NVFOREST_ENABLE_GPU
+    auto raft_handle = raft::handle_t{};  // constructed anew on every call
+    auto handle      = raft_proto::handle_t{raft_handle};
+#else
+    auto handle = raft_proto::handle_t{};
+#endif
+    predict(handle,
+            output,
+            input,
+            num_rows,
+            out_mem_type,
+            in_mem_type,
+            predict_type,
+            specified_chunk_size);
+    handle.synchronize();  // handle is local to this call, so block here before returning
+  }
+
  private:
   decision_forest_variant decision_forest_;
 };
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index 528aebb..f3b7624 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -408,17 +408,10 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
   auto nvforest_model = import_from_treelite_model(*tl_model, tree_layout::breadth_first);
   ASSERT_TRUE(nvforest_model.has_vector_leaves());
 
-#ifdef NVFOREST_ENABLE_GPU
-  auto raft_handle = raft::handle_t{};
-  auto handle      = raft_proto::handle_t{raft_handle};
-#else
-  auto handle = raft_proto::handle_t{};
-#endif
   auto X              = std::vector{0.0};
   auto preds          = std::vector(2, 0.0);
   auto expected_preds = std::vector{0.5, 0.5};
-  nvforest_model.predict(handle,
-                         preds.data(),
+  nvforest_model.predict(preds.data(),
                          X.data(),
                          1,
                          raft_proto::device_type::cpu,
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 8500861..bbe2710 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -193,6 +193,19 @@ Once the tree model is available as a Treelite object, pass it to the
 
 Now that the tree model is fully imported into nvForest, let's run inference:
 
+.. code-block:: cpp
+
+   // Assumption:
+   // * Both output and input are in the GPU memory.
+ // * The input buffer should be of dimension (num_rows, num_features) + // * The output buffer should be of dimension (num_rows, fm.num_outputs()) + fm.predict(output, input, num_rows, + raft_proto::device_type::gpu, raft_proto::device_type::gpu, + nvforest::infer_kind::default_kind); + +Applications that want more control over handle ownership, stream reuse, or +synchronization can pass a RAFT handle explicitly like this: + .. code-block:: cpp #include @@ -201,10 +214,6 @@ Now that the tree model is fully imported into nvForest, let's run inference: raft::handle_t raft_handle{}; raft_proto::handle_t handle{raft_handle}; - // Assumption: - // * Both output and input are in the GPU memory. - // * The input buffer should be of dimension (num_rows, num_features) - // * The output buffer should be of dimension (num_rows, fm.num_outputs()) fm.predict(handle, output, input, num_rows, raft_proto::device_type::gpu, raft_proto::device_type::gpu, nvforest::infer_kind::default_kind);