diff --git a/build.sh b/build.sh index 541c784..6497744 100755 --- a/build.sh +++ b/build.sh @@ -291,9 +291,9 @@ if (! hasArg --configure-only) && (completeBuild || hasArg libnvforest); then fi fi MSG="${MSG}
parallel setting: $PARALLEL_LEVEL" - if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest++.so" ]]; then - LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest++.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}') - MSG="${MSG}
libnvforest++.so size: $LIBNVFOREST_FS" + if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest.so" ]]; then + LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}') + MSG="${MSG}
libnvforest.so size: $LIBNVFOREST_FS" fi BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIBNVFOREST_BUILD_DIR}"} echo "The HTML report can be found at [${BMR_DIR}/ninja_log.html]. In CI, this report" diff --git a/ci/build_wheel_nvforest.sh b/ci/build_wheel_nvforest.sh index 7960ce2..ce96a70 100755 --- a/ci/build_wheel_nvforest.sh +++ b/ci/build_wheel_nvforest.sh @@ -20,7 +20,7 @@ LIBNVFOREST_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libnvforest_${RAPIDS_PY_CUDA_SUFF echo "libnvforest-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBNVFOREST_WHEELHOUSE}"/libnvforest_*.whl)" >> "${PIP_CONSTRAINT}" EXCLUDE_ARGS=( - --exclude "libnvforest++.so" + --exclude "libnvforest.so" --exclude "libraft.so" --exclude "libcublas.so.*" --exclude "libcublasLt.so.*" diff --git a/conda/recipes/libnvforest/recipe.yaml b/conda/recipes/libnvforest/recipe.yaml index 3495392..26ca5b9 100644 --- a/conda/recipes/libnvforest/recipe.yaml +++ b/conda/recipes/libnvforest/recipe.yaml @@ -91,7 +91,7 @@ outputs: prefix_detection: ignore: # See https://github.com/rapidsai/build-planning/issues/160 - - lib/libnvforest++.so + - lib/libnvforest.so string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} requirements: build: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fb9d015..8dbaa15 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -60,7 +60,7 @@ option(NVTX "Enable nvtx markers" OFF) option(USE_CCACHE "Cache build artifacts with ccache" OFF) option(NVFOREST_USE_RAFT_STATIC "Build and statically link the RAFT library" OFF) option(NVFOREST_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF) -option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest++" OFF) +option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest" OFF) option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # The options below allow incorporating libnvforest into another build process without installing all its components. @@ -123,7 +123,7 @@ endif() # ###################################################################################################################### # * Target names ------------------------------------------------------------- -set(NVFOREST_CPP_TARGET "nvforest++") +set(NVFOREST_CPP_TARGET "nvforest") # ###################################################################################################################### # * Conda environment detection ---------------------------------------------- @@ -193,7 +193,7 @@ if(BUILD_NVFOREST_TESTS) endif() # ###################################################################################################################### -# * build libnvforest++ shared library ------------------------------------------- +# * build libnvforest shared library ------------------------------------------- file( WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" diff --git a/cpp/include/nvforest/Implementation.md b/cpp/include/nvforest/Implementation.md index f34224b..b0bc595 100644 --- a/cpp/include/nvforest/Implementation.md +++ b/cpp/include/nvforest/Implementation.md @@ -6,17 +6,6 @@ does *not* require nvcc, CUDA or any other GPU-related library for its CPU-only build, we also go over general strategies for CPU/GPU interoperability as used by nvForest. -**A NOTE ON THE `raft_proto` NAMESPACE:** In addition to nvForest-specific code, the new -implementation requires some more general-purpose CPU-GPU interoperable -utilities. Many of these utilities are either already implemented in RAFT (but -do not provide the required CPU-interoperable compilation guarantees) or are a -natural fit for incorporation in RAFT. In order to allow for more careful -integration with the existing RAFT codebase and interoperability -strategies, these utilities are currently provided in the `raft_proto` -namespace but will be moved into RAFT over time. Other algorithms should -not make use of the `raft_proto` namespace but instead wait until this -transition has taken place. - ## Design Goals 1. Provide state-of-the-art runtime performance for forest models on GPU, especially for cases where CPU performance will not suffice (e.g. large @@ -43,7 +32,7 @@ codebase. It is also occasionally useful to make use of a `constexpr` value indicating whether or not `NVFOREST_ENABLE_GPU` is set, which we introduce as -`raft_proto::GPU_ENABLED`. +`nvforest::detail::GPU_ENABLED`. ### Avoiding CUDA symbols in CPU-only builds The most significant challenge of attempting to create a unified CPU/GPU @@ -88,7 +77,7 @@ between GPU and CPU. Where we _need_ to provide distinct logic between GPU and CPU implementations, we do so in implementation headers. In `infer/cpu.hpp`, we have a fully-defined template for CPU specializations of -`detail::inference::infer`. If `raft_proto::GPU_ENABLED` is `false`, we also +`detail::inference::infer`. If `nvforest::detail::GPU_ENABLED` is `false`, we also include the GPU specializations, which will simply throw an exception if invoked. In `infer/gpu.hpp` we *declare* but do not *define* the GPU specializations. In `infer/gpu.cuh` we provide the full working definition for @@ -158,8 +147,8 @@ a standard benchmark) on the CPU. With some motivation for the general approach to CPU-GPU interoperability, we now offer an overview of the layout of the codebase to help guide future -improvements. Because `raft_proto` utilities are going to be moved to RAFT or other -general-purpose libraries, we will not review anything within the `raft_proto` +improvements. Because `nvforest::detail` utilities are going to be moved to RAFT or other +general-purpose libraries, we will not review anything within the `nvforest::detail` directory here. ### Public Headers diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md index 3e3df17..711ab33 100644 --- a/cpp/include/nvforest/README.md +++ b/cpp/include/nvforest/README.md @@ -19,13 +19,6 @@ available in the top-level include directory. The `detail` directory contains implementation details that are not required to use nvForest and which will certainly change over time. -**A NOTE ON THE `raft_proto` NAMESPACE:** For the first iteration of this nvForest -implementation, much of the more general-purpose CPU-GPU interoperable code -has temporarily been put in the `raft_proto` namespace. As the name suggests, -the intention is that most or all of this functionality will either be moved -to RAFT or that RAFT features will be updated to provide CPU-GPU -compatible versions of the same. - ### Importing a model nvForest uses Treelite as a common translation layer for all its input types. To load a forest model, we first create a Treelite model handle as @@ -50,7 +43,7 @@ auto nvforest_model = import_from_treelite_model( tree_layout::depth_first, // layout 128u, // align_bytes false, // use_double_precision - raft_proto::device_type::gpu, // mem_type + nvforest::device_type::gpu, // mem_type 0, // device_id stream // CUDA stream ); @@ -74,7 +67,7 @@ serialization format will be used. Otherwise, the model will be evaluated at double precision if this value is set to `true` or single precision if this value is set to `false`. -**dev_type**: This argument controls where the model will be executed. If `raft_proto::device_type::gpu`, then it will be executed on GPU. If `raft_proto::device_type::cpu`, then it will be executed on CPU. +**dev_type**: This argument controls where the model will be executed. If `nvforest::device_type::gpu`, then it will be executed on GPU. If `nvforest::device_type::cpu`, then it will be executed on CPU. **device_id**: This integer indicates the ID of the GPU which should be used. If CPU is being used, this argument is ignored. @@ -82,9 +75,9 @@ If CPU is being used, this argument is ignored. **stream**: The CUDA stream which will be used for the actual model import. If CPU is being used, this argument is ignored. Note that you do *not* need CUDA headers if you are working with a CPU-only build of nvForest. This -argument uses a `raft_proto::cuda_stream` type which evaluates to a +argument uses a `nvforest::cuda_stream` type which evaluates to a placeholder type in CPU-only builds. For applications which themselves want to -implement CPU-GPU interoperable builds, the `raft_proto::cuda_stream` type can be +implement CPU-GPU interoperable builds, the `nvforest::cuda_stream` type can be used directly. @@ -106,24 +99,24 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float)); // Assuming that input is a float* pointing to data already located on-device -auto handle = raft_proto::handle_t{}; +auto handle = nvforest::handle_t{}; nvforest_model.predict( handle, output, input, num_rows, - raft_proto::device_type::gpu, // out_mem_type - raft_proto::device_type::gpu, // in_mem_type + nvforest::device_type::gpu, // out_mem_type + nvforest::device_type::gpu, // in_mem_type 4 // chunk_size ); ``` **handle**: To provide a unified interface on CPU and GPU, we introduce -`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a +`nvforest::handle_t` as a wrapper for `raft::handle_t`. This is currently just a placeholder in CPU-only builds, and using it does not require any CUDA functionality. For GPU-enabled builds, you can construct a -`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use. +`nvforest::handle_t` directly from the `raft::handle_t` you wish to use. **output**: Pointer to pre-allocated buffer where results should be written. If the model has been loaded at single precision, this should be a diff --git a/cpp/include/nvforest/detail/raft_proto/buffer.hpp b/cpp/include/nvforest/buffer.hpp similarity index 69% rename from cpp/include/nvforest/detail/raft_proto/buffer.hpp rename to cpp/include/nvforest/buffer.hpp index 7490e48..deebf60 100644 --- a/cpp/include/nvforest/detail/raft_proto/buffer.hpp +++ b/cpp/include/nvforest/buffer.hpp @@ -3,15 +3,15 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -21,7 +21,8 @@ #include #include -namespace raft_proto { +namespace nvforest { + /** * @brief A container which may or may not own its own data on host or device * @@ -31,10 +32,10 @@ struct buffer { using index_type = std::size_t; using value_type = T; - using data_store = std::variant, - non_owning_buffer, - owning_buffer, - owning_buffer>; + using data_store = std::variant, + detail::non_owning_buffer, + detail::owning_buffer, + detail::owning_buffer>; buffer() : device_{}, data_{}, size_{}, cached_ptr{nullptr} {} @@ -44,19 +45,19 @@ struct buffer { int device = 0, cuda_stream stream = 0) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; + auto result = detail::device_id_variant{}; switch (mem_type) { - case device_type::cpu: result = device_id{device}; break; - case device_type::gpu: result = device_id{device}; break; + case device_type::cpu: result = detail::device_id{device}; break; + case device_type::gpu: result = detail::device_id{device}; break; } return result; }()}, data_{[this, mem_type, size, stream]() { auto result = data_store{}; switch (mem_type) { - case device_type::cpu: result = owning_buffer{size}; break; + case device_type::cpu: result = detail::owning_buffer{size}; break; case device_type::gpu: - result = owning_buffer{std::get<1>(device_), size, stream}; + result = detail::owning_buffer{std::get<1>(device_), size, stream}; break; } return result; @@ -78,18 +79,22 @@ struct buffer { /** Construct non-owning buffer */ buffer(T* input_data, index_type size, device_type mem_type = device_type::cpu, int device = 0) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; + auto result = detail::device_id_variant{}; switch (mem_type) { - case device_type::cpu: result = device_id{device}; break; - case device_type::gpu: result = device_id{device}; break; + case device_type::cpu: result = detail::device_id{device}; break; + case device_type::gpu: result = detail::device_id{device}; break; } return result; }()}, data_{[input_data, mem_type]() { auto result = data_store{}; switch (mem_type) { - case device_type::cpu: result = non_owning_buffer{input_data}; break; - case device_type::gpu: result = non_owning_buffer{input_data}; break; + case device_type::cpu: + result = detail::non_owning_buffer{input_data}; + break; + case device_type::gpu: + result = detail::non_owning_buffer{input_data}; + break; } return result; }()}, @@ -118,10 +123,10 @@ struct buffer { int device = 0, cuda_stream stream = cuda_stream{}) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; + auto result = detail::device_id_variant{}; switch (mem_type) { - case device_type::cpu: result = device_id{device}; break; - case device_type::gpu: result = device_id{device}; break; + case device_type::cpu: result = detail::device_id{device}; break; + case device_type::gpu: result = detail::device_id{device}; break; } return result; }()}, @@ -129,11 +134,12 @@ struct buffer { auto result = data_store{}; auto result_data = static_cast(nullptr); if (mem_type == device_type::cpu) { - auto buf = owning_buffer(other.size()); + auto buf = detail::owning_buffer(other.size()); result_data = buf.get(); result = std::move(buf); } else if (mem_type == device_type::gpu) { - auto buf = owning_buffer(std::get<1>(device_), other.size(), stream); + auto buf = + detail::owning_buffer(std::get<1>(device_), other.size(), stream); result_data = buf.get(); result = std::move(buf); } @@ -188,10 +194,10 @@ struct buffer { */ buffer(buffer&& other, device_type mem_type, int device, cuda_stream stream) : device_{[mem_type, &device]() { - auto result = device_id_variant{}; + auto result = detail::device_id_variant{}; switch (mem_type) { - case device_type::cpu: result = device_id{device}; break; - case device_type::gpu: result = device_id{device}; break; + case device_type::cpu: result = detail::device_id{device}; break; + case device_type::gpu: result = detail::device_id{device}; break; } return result; }()}, @@ -202,11 +208,11 @@ struct buffer { } else { auto* result_data = static_cast(nullptr); if (mem_type == device_type::cpu) { - auto buf = owning_buffer{other.size()}; + auto buf = detail::owning_buffer{other.size()}; result_data = buf.get(); result = std::move(buf); } else if (mem_type == device_type::gpu) { - auto buf = owning_buffer{device, other.size(), stream}; + auto buf = detail::owning_buffer{device, other.size(), stream}; result_data = buf.get(); result = std::move(buf); } @@ -306,23 +312,23 @@ struct buffer { ~buffer() = default; private: - device_id_variant device_; + detail::device_id_variant device_; data_store data_; index_type size_; T* cached_ptr; }; template -const_agnostic_same_t copy(buffer& dst, - buffer const& src, - typename buffer::index_type dst_offset, - typename buffer::index_type src_offset, - typename buffer::index_type size, - cuda_stream stream) +detail::const_agnostic_same_t copy(buffer& dst, + buffer const& src, + typename buffer::index_type dst_offset, + typename buffer::index_type src_offset, + typename buffer::index_type size, + cuda_stream stream) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); + throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } copy(dst.data() + dst_offset, @@ -334,27 +340,27 @@ const_agnostic_same_t copy(buffer& dst, } template -const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream) { copy(dst, src, 0, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer& dst, buffer const& src) +detail::const_agnostic_same_t copy(buffer& dst, buffer const& src) { copy(dst, src, 0, 0, src.size(), cuda_stream{}); } template -const_agnostic_same_t copy(buffer&& dst, - buffer&& src, - typename buffer::index_type dst_offset, - typename buffer::index_type src_offset, - typename buffer::index_type size, - cuda_stream stream) +detail::const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + typename buffer::index_type src_offset, + typename buffer::index_type size, + cuda_stream stream) { if constexpr (bounds_check) { if (src.size() - src_offset < size || dst.size() - dst_offset < size) { - throw out_of_bounds("Attempted copy to or from buffer of inadequate size"); + throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size"); } } copy(dst.data() + dst_offset, @@ -366,23 +372,23 @@ const_agnostic_same_t copy(buffer&& dst, } template -const_agnostic_same_t copy(buffer&& dst, - buffer&& src, - typename buffer::index_type dst_offset, - cuda_stream stream) +detail::const_agnostic_same_t copy(buffer&& dst, + buffer&& src, + typename buffer::index_type dst_offset, + cuda_stream stream) { copy(dst, src, dst_offset, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream) { copy(dst, src, 0, 0, src.size(), stream); } template -const_agnostic_same_t copy(buffer&& dst, buffer&& src) +detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src) { copy(dst, src, 0, 0, src.size(), cuda_stream{}); } -} // namespace raft_proto +} // namespace nvforest diff --git a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp b/cpp/include/nvforest/cuda_stream.hpp similarity index 71% rename from cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp rename to cpp/include/nvforest/cuda_stream.hpp index f80c488..d231698 100644 --- a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp +++ b/cpp/include/nvforest/cuda_stream.hpp @@ -7,7 +7,7 @@ #include #endif -namespace raft_proto { +namespace nvforest { #ifdef NVFOREST_ENABLE_GPU using cuda_stream = cudaStream_t; #else @@ -19,4 +19,9 @@ inline void synchronize(cuda_stream stream) cudaStreamSynchronize(stream); #endif } -} // namespace raft_proto +} // namespace nvforest + +namespace nvforest::detail { +using nvforest::cuda_stream; +using nvforest::synchronize; +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/decision_forest.hpp b/cpp/include/nvforest/decision_forest.hpp index e0c5a9d..6534196 100644 --- a/cpp/include/nvforest/decision_forest.hpp +++ b/cpp/include/nvforest/decision_forest.hpp @@ -3,15 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include +#include +#include #include +#include #include #include #include #include -#include -#include -#include #include #include #include @@ -145,21 +146,21 @@ struct decision_forest { * operations, including sigmoid, exponential, and * logarithm_one_plus_exp */ - decision_forest(raft_proto::buffer&& nodes, - raft_proto::buffer&& root_node_indexes, - raft_proto::buffer&& node_id_mapping, - raft_proto::buffer&& bias, - index_type num_features, - index_type num_outputs = index_type{2}, - bool has_categorical_nodes = false, - std::optional>&& vector_output = std::nullopt, - std::optional>&& - categorical_storage = std::nullopt, - index_type leaf_size = index_type{1}, - row_op row_postproc = row_op::disable, - element_op elem_postproc = element_op::disable, - io_type average_factor = io_type{1}, - io_type postproc_constant = io_type{1}) + decision_forest( + buffer&& nodes, + buffer&& root_node_indexes, + buffer&& node_id_mapping, + buffer&& bias, + index_type num_features, + index_type num_outputs = index_type{2}, + bool has_categorical_nodes = false, + std::optional>&& vector_output = std::nullopt, + std::optional>&& categorical_storage = std::nullopt, + index_type leaf_size = index_type{1}, + row_op row_postproc = row_op::disable, + element_op elem_postproc = element_op::disable, + io_type average_factor = io_type{1}, + io_type postproc_constant = io_type{1}) : nodes_{nodes}, root_node_indexes_{root_node_indexes}, node_id_mapping_{node_id_mapping}, @@ -176,11 +177,11 @@ struct decision_forest { postproc_constant_{postproc_constant} { if (nodes.memory_type() != root_node_indexes.memory_type()) { - throw raft_proto::mem_type_mismatch( + throw detail::mem_type_mismatch( "Nodes and indexes of forest must both be stored on either host or device"); } if (nodes.device_index() != root_node_indexes.device_index()) { - throw raft_proto::mem_type_mismatch( + throw detail::mem_type_mismatch( "Nodes and indexes of forest must both be stored on same device"); } detail::initialize_device(nodes.device()); @@ -245,18 +246,18 @@ struct decision_forest { * 1 to 32 is a valid value, and in general larger batches benefit from * larger values. */ - void predict(raft_proto::buffer& output, - raft_proto::buffer const& input, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, + void predict(buffer& output, + buffer const& input, + nvforest::cuda_stream stream = nvforest::cuda_stream{}, infer_kind predict_type = infer_kind::default_kind, std::optional specified_rows_per_block_iter = std::nullopt) { if (output.memory_type() != memory_type() || input.memory_type() != memory_type()) { - throw raft_proto::wrong_device_type{ + throw detail::wrong_device_type{ "Tried to use host I/O data with model on device or vice versa"}; } if (output.device_index() != device_index() || input.device_index() != device_index()) { - throw raft_proto::wrong_device{"I/O data on different device than model"}; + throw detail::wrong_device{"I/O data on different device than model"}; } auto* vector_output_data = (vector_output_.has_value() ? vector_output_->data() : static_cast(nullptr)); @@ -265,54 +266,54 @@ struct decision_forest { : static_cast(nullptr)); switch (nodes_.device().index()) { case 0: - nvforest::detail::infer(obj(), - get_postprocessor(predict_type), - output.data(), - input.data(), - index_type(input.size() / num_features_), - num_features_, - num_outputs(predict_type), - has_categorical_nodes_, - vector_output_data, - categorical_storage_data, - predict_type, - specified_rows_per_block_iter, - std::get<0>(nodes_.device()), - stream); + detail::infer(obj(), + get_postprocessor(predict_type), + output.data(), + input.data(), + index_type(input.size() / num_features_), + num_features_, + num_outputs(predict_type), + has_categorical_nodes_, + vector_output_data, + categorical_storage_data, + predict_type, + specified_rows_per_block_iter, + std::get<0>(nodes_.device()), + stream); break; case 1: - nvforest::detail::infer(obj(), - get_postprocessor(predict_type), - output.data(), - input.data(), - index_type(input.size() / num_features_), - num_features_, - num_outputs(predict_type), - has_categorical_nodes_, - vector_output_data, - categorical_storage_data, - predict_type, - specified_rows_per_block_iter, - std::get<1>(nodes_.device()), - stream); + detail::infer(obj(), + get_postprocessor(predict_type), + output.data(), + input.data(), + index_type(input.size() / num_features_), + num_features_, + num_outputs(predict_type), + has_categorical_nodes_, + vector_output_data, + categorical_storage_data, + predict_type, + specified_rows_per_block_iter, + std::get<1>(nodes_.device()), + stream); break; } } private: /** The nodes for all trees in the forest */ - raft_proto::buffer nodes_; + buffer nodes_; /** The index of the root node for each tree in the forest */ - raft_proto::buffer root_node_indexes_; + buffer root_node_indexes_; /** Mapping to apply to node IDs. Only relevant when predict_type == infer_kind::leaf_id */ - raft_proto::buffer node_id_mapping_; + buffer node_id_mapping_; /** Bias term to apply to the output */ - raft_proto::buffer bias_; + buffer bias_; /** Buffer of outputs for all leaves in vector-leaf models */ - std::optional> vector_output_; + std::optional> vector_output_; /** Buffer of elements used as backing data for bitsets which specify * categories for all categorical nodes in the model. */ - std::optional> categorical_storage_; + std::optional> categorical_storage_; // Metadata index_type num_features_; @@ -458,7 +459,7 @@ inline auto get_forest_variant_index(bool use_double_thresholds, // TODO(wphicks): We are overestimating categorical storage required here auto double_indexes_required = (max_num_categories > max_local_categories && - ((raft_proto::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) > + ((detail::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) > std::numeric_limits::max())) || num_vector_leaves > std::numeric_limits::max(); diff --git a/cpp/include/nvforest/detail/bitset.hpp b/cpp/include/nvforest/detail/bitset.hpp index c11cdee..c166eb3 100644 --- a/cpp/include/nvforest/detail/bitset.hpp +++ b/cpp/include/nvforest/detail/bitset.hpp @@ -3,8 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include +#include #include #include diff --git a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp b/cpp/include/nvforest/detail/ceildiv.hpp similarity index 79% rename from cpp/include/nvforest/detail/raft_proto/ceildiv.hpp rename to cpp/include/nvforest/detail/ceildiv.hpp index 8f4fb6c..4d6245b 100644 --- a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp +++ b/cpp/include/nvforest/detail/ceildiv.hpp @@ -3,15 +3,15 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include #include -namespace raft_proto { +namespace nvforest::detail { template HOST DEVICE auto constexpr ceildiv(T dividend, U divisor) { static_assert(std::is_integral_v && std::is_integral_v, "Arguments must be integers"); return dividend / divisor + (dividend % divisor != 0); } -} // namespace raft_proto +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/const_agnostic.hpp b/cpp/include/nvforest/detail/const_agnostic.hpp new file mode 100644 index 0000000..25d8a45 --- /dev/null +++ b/cpp/include/nvforest/detail/const_agnostic.hpp @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include + +namespace nvforest::detail { +template +using const_agnostic_same_t = + std::enable_if_t, std::remove_const_t>, V>; + +template +inline constexpr auto const_agnostic_same_v = + std::is_same_v, std::remove_const_t>; +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp b/cpp/include/nvforest/detail/copy.hpp similarity index 58% rename from cpp/include/nvforest/detail/raft_proto/detail/copy.hpp rename to cpp/include/nvforest/detail/copy.hpp index fffd3ac..b02a996 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp +++ b/cpp/include/nvforest/detail/copy.hpp @@ -3,39 +3,33 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include +#include #include #ifdef NVFOREST_ENABLE_GPU -#include +#include #endif -#include +#include -namespace raft_proto { +namespace nvforest { template void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset) { - detail::copy(dst + dst_offset, src + src_offset, size, cuda_stream{}); + copy(dst + dst_offset, src + src_offset, size, cuda_stream{}); } template void copy( T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream) { - detail::copy(dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } template void copy(T* dst, T const* src, uint32_t size) { - detail::copy(dst, src, size, cuda_stream{}); -} - -template -void copy(T* dst, T const* src, uint32_t size, cuda_stream stream) -{ - detail::copy(dst, src, size, stream); + copy(dst, src, size, cuda_stream{}); } template @@ -49,17 +43,13 @@ void copy(T* dst, cuda_stream stream) { if (dst_type == device_type::gpu && src_type == device_type::gpu) { - detail::copy( - dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::cpu) { - detail::copy( - dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::gpu && src_type == device_type::cpu) { - detail::copy( - dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } else if (dst_type == device_type::cpu && src_type == device_type::gpu) { - detail::copy( - dst + dst_offset, src + src_offset, size, stream); + copy(dst + dst_offset, src + src_offset, size, stream); } } @@ -80,4 +70,4 @@ void copy(T* dst, copy(dst, src, size, dst_type, src_type, 0, 0, stream); } -} // namespace raft_proto +} // namespace nvforest diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp b/cpp/include/nvforest/detail/copy/cpu.hpp similarity index 71% rename from cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp rename to cpp/include/nvforest/detail/copy/cpu.hpp index c519c5b..796a97b 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp +++ b/cpp/include/nvforest/detail/copy/cpu.hpp @@ -3,16 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include #include #include -namespace raft_proto::detail { +namespace nvforest { template std::enable_if_t, @@ -27,11 +27,11 @@ template std::enable_if_t< std::conjunction_v, std::bool_constant>, - std::bool_constant>, + std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { - throw gpu_unsupported("Copying from or to device in non-GPU build"); + throw detail::gpu_unsupported("Copying from or to device in non-GPU build"); } -} // namespace raft_proto::detail +} // namespace nvforest diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp b/cpp/include/nvforest/detail/copy/gpu.hpp similarity index 58% rename from cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp rename to cpp/include/nvforest/detail/copy/gpu.hpp index b5bcded..94ae559 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp +++ b/cpp/include/nvforest/detail/copy/gpu.hpp @@ -3,9 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include @@ -13,17 +13,17 @@ #include -namespace raft_proto::detail { +namespace nvforest { template std::enable_if_t< std::conjunction_v, std::bool_constant>, - std::bool_constant>, + std::bool_constant>, void> copy(T* dst, T const* src, uint32_t size, cuda_stream stream) { - raft_proto::cuda_check(cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream)); + detail::cuda_check(cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream)); } -} // namespace raft_proto::detail +} // namespace nvforest diff --git a/cpp/include/nvforest/detail/cuda_check.hpp b/cpp/include/nvforest/detail/cuda_check.hpp new file mode 100644 index 0000000..b5c8e58 --- /dev/null +++ b/cpp/include/nvforest/detail/cuda_check.hpp @@ -0,0 +1,19 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#ifdef NVFOREST_ENABLE_GPU +#include +#endif +#include +#include + +namespace nvforest::detail { +template +void cuda_check(error_t const& err) noexcept(!GPU_ENABLED) +{ + cuda_check(err); +} +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp b/cpp/include/nvforest/detail/cuda_check/base.hpp similarity index 64% rename from cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp rename to cpp/include/nvforest/detail/cuda_check/base.hpp index 3f797d0..c697ddc 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp +++ b/cpp/include/nvforest/detail/cuda_check/base.hpp @@ -3,13 +3,13 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include -namespace raft_proto::detail { +namespace nvforest::detail { template void cuda_check(error_t const& err) { } -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp b/cpp/include/nvforest/detail/cuda_check/gpu.hpp similarity index 61% rename from cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp rename to cpp/include/nvforest/detail/cuda_check/gpu.hpp index e02b88e..917166c 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp +++ b/cpp/include/nvforest/detail/cuda_check/gpu.hpp @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include -namespace raft_proto::detail { +namespace nvforest::detail { template <> inline void cuda_check(cudaError_t const& err) noexcept(false) @@ -19,4 +19,4 @@ inline void cuda_check(cudaError_t const& err) no } } -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/decision_forest_builder.hpp b/cpp/include/nvforest/detail/decision_forest_builder.hpp index 8d33ed7..0795cdd 100644 --- a/cpp/include/nvforest/detail/decision_forest_builder.hpp +++ b/cpp/include/nvforest/detail/decision_forest_builder.hpp @@ -3,13 +3,13 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include #include #include @@ -115,7 +115,7 @@ struct decision_forest_builder { } if (max_num_categories_ > bin_width) { node_value = categorical_storage_.size(); - auto bins_required = raft_proto::ceildiv(max_cat_plus_one, bin_width); + auto bins_required = ceildiv(max_cat_plus_one, bin_width); categorical_storage_.push_back(max_cat_plus_one); categorical_storage_.resize(categorical_storage_.size() + bins_required); set_storage = &(categorical_storage_[node_value + 1]); @@ -228,12 +228,12 @@ struct decision_forest_builder { /* Return the nvForest decision forest built by this builder */ auto get_decision_forest(index_type num_feature, index_type num_class, - raft_proto::device_type mem_type = raft_proto::device_type::cpu, - int device = 0, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + device_type mem_type = device_type::cpu, + int device = 0, + cuda_stream stream = cuda_stream{}) { // Set device = -1 when loading the model onto CPU - if (mem_type == raft_proto::device_type::cpu) { device = -1; } + if (mem_type == device_type::cpu) { device = -1; } // Validate forest invariants the inference kernel relies on. After this // function returns, the forest is treated as trusted by the kernel. @@ -267,7 +267,7 @@ struct decision_forest_builder { " >= " + std::to_string(storage_size) + ")"}; } auto const stored_num_cats = categorical_storage_[offset]; - auto const bins_required = raft_proto::ceildiv(stored_num_cats, cat_bin_width); + auto const bins_required = ceildiv(stored_num_cats, cat_bin_width); auto const bits_begin = static_cast(offset) + std::size_t{1}; auto const bits_end = bits_begin + static_cast(bins_required); if (bits_end > storage_size) { @@ -295,31 +295,22 @@ struct decision_forest_builder { } return decision_forest_t{ - raft_proto::buffer{ - raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream}, - raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()}, - mem_type, - device, - stream}, - raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()}, - mem_type, - device, - stream}, - raft_proto::buffer{raft_proto::buffer{bias_.data(), bias_.size()}, mem_type, device, stream}, + buffer{buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream}, + buffer{ + buffer{root_node_indexes_.data(), root_node_indexes_.size()}, mem_type, device, stream}, + buffer{buffer{node_id_mapping_.data(), node_id_mapping_.size()}, mem_type, device, stream}, + buffer{buffer{bias_.data(), bias_.size()}, mem_type, device, stream}, num_feature, num_class, max_num_categories_ != 0, vector_output_.empty() ? std::nullopt - : std::make_optional>( - raft_proto::buffer{vector_output_.data(), vector_output_.size()}, - mem_type, - device, - stream), + : std::make_optional>( + buffer{vector_output_.data(), vector_output_.size()}, mem_type, device, stream), categorical_storage_.empty() ? std::nullopt - : std::make_optional>( - raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()}, + : std::make_optional>( + buffer{categorical_storage_.data(), categorical_storage_.size()}, mem_type, device, stream), diff --git a/cpp/include/nvforest/detail/device_id.hpp b/cpp/include/nvforest/detail/device_id.hpp new file mode 100644 index 0000000..76bd20d --- /dev/null +++ b/cpp/include/nvforest/detail/device_id.hpp @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#ifdef NVFOREST_ENABLE_GPU +#include +#endif +#include + +#include + +namespace nvforest::detail { +using device_id_variant = std::variant, device_id>; +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp b/cpp/include/nvforest/detail/device_id/base.hpp similarity index 70% rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp rename to cpp/include/nvforest/detail/device_id/base.hpp index 214d1d8..fa56ad7 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp +++ b/cpp/include/nvforest/detail/device_id/base.hpp @@ -3,9 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include -namespace raft_proto::detail { +namespace nvforest::detail { template struct device_id { using value_type = int; @@ -13,4 +13,4 @@ struct device_id { device_id(value_type device_index) {} auto value() const { return value_type{}; } }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp b/cpp/include/nvforest/detail/device_id/cpu.hpp similarity index 66% rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp rename to cpp/include/nvforest/detail/device_id/cpu.hpp index 3c4eded..be7105a 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp +++ b/cpp/include/nvforest/detail/device_id/cpu.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include +#include -namespace raft_proto::detail { +namespace nvforest::detail { template <> struct device_id { using value_type = int; @@ -18,4 +18,4 @@ struct device_id { private: value_type id_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp b/cpp/include/nvforest/detail/device_id/gpu.hpp similarity index 65% rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp rename to cpp/include/nvforest/detail/device_id/gpu.hpp index 6248878..3cc0d01 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp +++ b/cpp/include/nvforest/detail/device_id/gpu.hpp @@ -3,20 +3,20 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include -namespace raft_proto::detail { +namespace nvforest::detail { template <> struct device_id { using value_type = typename rmm::cuda_device_id::value_type; device_id() noexcept(false) : id_{[]() { auto raw_id = value_type{}; - raft_proto::cuda_check(cudaGetDevice(&raw_id)); + cuda_check(cudaGetDevice(&raw_id)); return raw_id; }()} {}; device_id(value_type dev_id) noexcept : id_{dev_id} {}; @@ -26,4 +26,4 @@ struct device_id { private: rmm::cuda_device_id id_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/device_initialization.hpp b/cpp/include/nvforest/detail/device_initialization.hpp index bb1bff6..47a07ea 100644 --- a/cpp/include/nvforest/detail/device_initialization.hpp +++ b/cpp/include/nvforest/detail/device_initialization.hpp @@ -13,15 +13,15 @@ namespace nvforest::detail { /* Set any required device options for optimizing nvForest compute */ -template -void initialize_device(raft_proto::device_id device) +template +void initialize_device(device_id device) { device_initialization::initialize_device(device); } /* Set any required device options for optimizing nvForest compute */ template -void initialize_device(raft_proto::device_id_variant device) +void initialize_device(device_id_variant device) { std::visit( [](auto&& concrete_device) { diff --git a/cpp/include/nvforest/detail/device_initialization/cpu.hpp b/cpp/include/nvforest/detail/device_initialization/cpu.hpp index dd2747e..5ae78ff 100644 --- a/cpp/include/nvforest/detail/device_initialization/cpu.hpp +++ b/cpp/include/nvforest/detail/device_initialization/cpu.hpp @@ -4,9 +4,9 @@ */ #pragma once -#include -#include -#include +#include +#include +#include #include @@ -17,11 +17,11 @@ namespace nvforest::detail::device_initialization { * This specialization will also be used for non-GPU-enabled builds * (as a GPU no-op). */ -template -std::enable_if_t, - std::bool_constant>, - void> -initialize_device(raft_proto::device_id device) +template +std::enable_if_t< + std::disjunction_v, std::bool_constant>, + void> +initialize_device(device_id device) { } diff --git a/cpp/include/nvforest/detail/device_initialization/gpu.cuh b/cpp/include/nvforest/detail/device_initialization/gpu.cuh index e95ffea..001c627 100644 --- a/cpp/include/nvforest/detail/device_initialization/gpu.cuh +++ b/cpp/include/nvforest/detail/device_initialization/gpu.cuh @@ -5,15 +5,15 @@ #pragma once #include +#include +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include #include +#include #include @@ -27,197 +27,178 @@ namespace nvforest::detail::device_initialization { * the inference kernels have access to the maximum available dynamic shared * memory. */ -template -std::enable_if_t, - std::bool_constant>, - void> -initialize_device(raft_proto::device_id device) +template +std::enable_if_t< + std::conjunction_v, std::bool_constant>, + void> +initialize_device(device_id device) { - auto device_context = raft_proto::device_setter(device); + auto device_context = device_setter(device); auto max_shared_mem_per_block = get_max_shared_mem_per_block(device); // Run solely for side-effect of caching SM count get_sm_count(device); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check(cudaFuncSetAttribute( + cuda_check(cudaFuncSetAttribute( infer_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); - raft_proto::cuda_check( - cudaFuncSetAttribute(infer_kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); + cuda_check(cudaFuncSetAttribute(infer_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_mem_per_block)); } NVFOREST_INITIALIZE_DEVICE(extern template, 0) diff --git a/cpp/include/nvforest/detail/device_initialization/gpu.hpp b/cpp/include/nvforest/detail/device_initialization/gpu.hpp index 431777a..5593fd4 100644 --- a/cpp/include/nvforest/detail/device_initialization/gpu.hpp +++ b/cpp/include/nvforest/detail/device_initialization/gpu.hpp @@ -4,10 +4,9 @@ */ #pragma once -#include -#include -#include -#include +#include +#include +#include #include @@ -16,10 +15,10 @@ namespace nvforest::detail::device_initialization { /* Non-CUDA header declaration of the GPU specialization for device * initialization */ -template -std::enable_if_t, - std::bool_constant>, - void> -initialize_device(raft_proto::device_id device); +template +std::enable_if_t< + std::conjunction_v, std::bool_constant>, + void> +initialize_device(device_id device); } // namespace nvforest::detail::device_initialization diff --git a/cpp/include/nvforest/detail/device_setter.hpp b/cpp/include/nvforest/detail/device_setter.hpp new file mode 100644 index 0000000..6a9c162 --- /dev/null +++ b/cpp/include/nvforest/detail/device_setter.hpp @@ -0,0 +1,10 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#ifdef NVFOREST_ENABLE_GPU +#include +#endif +#include diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp b/cpp/include/nvforest/detail/device_setter/base.hpp similarity index 62% rename from cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp rename to cpp/include/nvforest/detail/device_setter/base.hpp index 1fe46d8..069c62d 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp +++ b/cpp/include/nvforest/detail/device_setter/base.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include +#include -namespace raft_proto::detail { +namespace nvforest::detail { /** Struct for setting current device within a code block */ template @@ -14,4 +14,4 @@ struct device_setter { device_setter(device_id device) {} }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp b/cpp/include/nvforest/detail/device_setter/gpu.hpp similarity index 53% rename from cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp rename to cpp/include/nvforest/detail/device_setter/gpu.hpp index a256538..40564fd 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp +++ b/cpp/include/nvforest/detail/device_setter/gpu.hpp @@ -3,28 +3,28 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include #include -namespace raft_proto::detail { +namespace nvforest::detail { /** Struct for setting current device within a code block */ template <> struct device_setter { - device_setter(raft_proto::device_id device) noexcept(false) + device_setter(device_id device) noexcept(false) : prev_device_{[]() { auto result = int{}; - raft_proto::cuda_check(cudaGetDevice(&result)); + cuda_check(cudaGetDevice(&result)); return result; }()} { - raft_proto::cuda_check(cudaSetDevice(device.value())); + cuda_check(cudaSetDevice(device.value())); } ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); } @@ -33,4 +33,4 @@ struct device_setter { device_id prev_device_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/evaluate_tree.hpp b/cpp/include/nvforest/detail/evaluate_tree.hpp index cd23b26..4b3d0bd 100644 --- a/cpp/include/nvforest/detail/evaluate_tree.hpp +++ b/cpp/include/nvforest/detail/evaluate_tree.hpp @@ -10,7 +10,7 @@ #include #endif #include -#include +#include namespace nvforest::detail { /* diff --git a/cpp/include/nvforest/detail/raft_proto/exceptions.hpp b/cpp/include/nvforest/detail/exceptions.hpp similarity index 67% rename from cpp/include/nvforest/detail/raft_proto/exceptions.hpp rename to cpp/include/nvforest/detail/exceptions.hpp index 9c9ab27..eebefb3 100644 --- a/cpp/include/nvforest/detail/raft_proto/exceptions.hpp +++ b/cpp/include/nvforest/detail/exceptions.hpp @@ -1,22 +1,11 @@ /* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include -namespace raft_proto { +namespace nvforest::detail { struct bad_cuda_call : std::exception { bad_cuda_call() : bad_cuda_call("CUDA API call failed") {} bad_cuda_call(char const* msg) : msg_{msg} {} @@ -64,4 +53,4 @@ struct wrong_device : std::exception { char const* msg_; }; -} // namespace raft_proto +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/forest.hpp b/cpp/include/nvforest/detail/forest.hpp index a6c141b..6ad4c86 100644 --- a/cpp/include/nvforest/detail/forest.hpp +++ b/cpp/include/nvforest/detail/forest.hpp @@ -3,9 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include #include -#include #include diff --git a/cpp/include/nvforest/detail/gpu_introspection.hpp b/cpp/include/nvforest/detail/gpu_introspection.hpp index 2c8537f..891bff0 100644 --- a/cpp/include/nvforest/detail/gpu_introspection.hpp +++ b/cpp/include/nvforest/detail/gpu_introspection.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include +#include #include -#include -#include -#include +#include #include @@ -14,72 +14,69 @@ namespace nvforest::detail { -inline auto get_max_shared_mem_per_block( - raft_proto::device_id device_id) +inline auto get_max_shared_mem_per_block(device_id device_id) { auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; - raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); + cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); for (auto dev = 0; dev < device_count; ++dev) { - raft_proto::cuda_check( + cuda_check( cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_sm_count(raft_proto::device_id device_id) +inline auto get_sm_count(device_id device_id) { auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; - raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); + cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); for (auto dev = 0; dev < device_count; ++dev) { - raft_proto::cuda_check( - cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev)); + cuda_check(cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_max_threads_per_sm(raft_proto::device_id device_id) +inline auto get_max_threads_per_sm(device_id device_id) { auto result = int{}; - raft_proto::cuda_check( + cuda_check( cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value())); return index_type(result); } -inline auto get_max_shared_mem_per_sm(raft_proto::device_id device_id) +inline auto get_max_shared_mem_per_sm(device_id device_id) { auto thread_local cache = std::vector{}; if (cache.size() == 0) { auto device_count = int{}; - raft_proto::cuda_check(cudaGetDeviceCount(&device_count)); + cuda_check(cudaGetDeviceCount(&device_count)); cache.resize(device_count); for (auto dev = 0; dev < device_count; ++dev) { - raft_proto::cuda_check( + cuda_check( cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev)); } } return index_type(cache.at(device_id.value())); } -inline auto get_mem_clock_rate(raft_proto::device_id device_id) +inline auto get_mem_clock_rate(device_id device_id) { auto result = int{}; - raft_proto::cuda_check( - cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value())); + cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value())); return index_type(result); } -inline auto get_core_clock_rate(raft_proto::device_id device_id) +inline auto get_core_clock_rate(device_id device_id) { auto result = int{}; - raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value())); + cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value())); return index_type(result); } diff --git a/cpp/include/nvforest/detail/raft_proto/gpu_support.hpp b/cpp/include/nvforest/detail/gpu_support.hpp similarity index 94% rename from cpp/include/nvforest/detail/raft_proto/gpu_support.hpp rename to cpp/include/nvforest/detail/gpu_support.hpp index 6d03363..9994632 100644 --- a/cpp/include/nvforest/detail/raft_proto/gpu_support.hpp +++ b/cpp/include/nvforest/detail/gpu_support.hpp @@ -8,7 +8,7 @@ #include #include -namespace raft_proto { +namespace nvforest::detail { #ifdef NVFOREST_ENABLE_GPU auto constexpr static const GPU_ENABLED = true; #else @@ -42,4 +42,4 @@ struct gpu_unsupported : std::exception { char const* msg_; }; -} // namespace raft_proto +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/host_only_throw.hpp b/cpp/include/nvforest/detail/host_only_throw.hpp new file mode 100644 index 0000000..beb4e34 --- /dev/null +++ b/cpp/include/nvforest/detail/host_only_throw.hpp @@ -0,0 +1,8 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#include +#include diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp b/cpp/include/nvforest/detail/host_only_throw/base.hpp similarity index 66% rename from cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp rename to cpp/include/nvforest/detail/host_only_throw/base.hpp index b6c81c5..607eac6 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp +++ b/cpp/include/nvforest/detail/host_only_throw/base.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include -namespace raft_proto::detail { -template +namespace nvforest::detail { +template struct host_only_throw { template host_only_throw(Args&&... args) @@ -14,4 +14,4 @@ struct host_only_throw { static_assert(host); // Do not allow constexpr branch to compile if !host } }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp b/cpp/include/nvforest/detail/host_only_throw/cpu.hpp similarity index 62% rename from cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp rename to cpp/include/nvforest/detail/host_only_throw/cpu.hpp index 3dad51f..d6b893e 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp +++ b/cpp/include/nvforest/detail/host_only_throw/cpu.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include +#include -namespace raft_proto::detail { +namespace nvforest::detail { template struct host_only_throw { template @@ -15,4 +15,4 @@ struct host_only_throw { throw T{std::forward(args)...}; } }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/infer.hpp b/cpp/include/nvforest/detail/infer.hpp index d4005fd..553105d 100644 --- a/cpp/include/nvforest/detail/infer.hpp +++ b/cpp/include/nvforest/detail/infer.hpp @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include +#include #include #include #include -#include -#include -#include +#include #include #include @@ -51,7 +51,7 @@ namespace nvforest::detail { * @param device The device on which to execute evaluation * @param stream Optionally, the CUDA stream to use */ -template +template void infer(forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -64,8 +64,8 @@ void infer(forest_t const& forest, typename forest_t::node_type::index_type* categorical_data = nullptr, infer_kind infer_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt, - raft_proto::device_id device = raft_proto::device_id{}, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + device_id device = device_id{}, + cuda_stream stream = cuda_stream{}) { if (vector_output == nullptr) { if (categorical_data == nullptr) { diff --git a/cpp/include/nvforest/detail/infer/cpu.hpp b/cpp/include/nvforest/detail/infer/cpu.hpp index b3142d1..5e1f8c7 100644 --- a/cpp/include/nvforest/detail/infer/cpu.hpp +++ b/cpp/include/nvforest/detail/infer/cpu.hpp @@ -5,16 +5,16 @@ #pragma once #include +#include #include +#include #include +#include #include #include #include -#include -#include -#include -#include #include +#include #include #include @@ -62,14 +62,14 @@ namespace nvforest::detail::inference { * (for individual row inference) to 512 (for very large batch * inference). A value of 64 is a generally-useful default. */ -template -std::enable_if_t, - std::bool_constant>, - void> +std::enable_if_t< + std::disjunction_v, std::bool_constant>, + void> infer(forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -81,11 +81,11 @@ infer(forest_t const& forest, categorical_data_t categorical_data = nullptr, infer_kind infer_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt, - raft_proto::device_id device = raft_proto::device_id{}, - raft_proto::cuda_stream = raft_proto::cuda_stream{}) + device_id device = device_id{}, + cuda_stream = cuda_stream{}) { - if constexpr (D == raft_proto::device_type::gpu) { - throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build"); + if constexpr (D == device_type::gpu) { + throw gpu_unsupported("Tried to use GPU inference in CPU-only build"); } else { if (infer_type == infer_kind::leaf_id) { infer_kernel_cpu( @@ -124,17 +124,17 @@ infer(forest_t const& forest, * compiled as few times as possible. A macro is used because ever * specialization must be explicitly declared. The final argument to the macro * references the 8 specialization variants compiled in standard nvForest. */ -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 0) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 1) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 2) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 3) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 4) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 5) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 6) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 7) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 8) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 9) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 10) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 11) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 0) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 1) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 2) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 3) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 4) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 5) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 6) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 7) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 8) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 9) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 10) +NVFOREST_INFER_ALL(extern template, device_type::cpu, 11) } // namespace nvforest::detail::inference diff --git a/cpp/include/nvforest/detail/infer/gpu.cuh b/cpp/include/nvforest/detail/infer/gpu.cuh index def3f8d..16b8f52 100644 --- a/cpp/include/nvforest/detail/infer/gpu.cuh +++ b/cpp/include/nvforest/detail/infer/gpu.cuh @@ -3,21 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include +#include +#include +#include +#include #include #include +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include #include @@ -36,7 +36,7 @@ inline auto compute_output_size(index_type row_output_size, { auto result = row_output_size * rows_per_block_iteration; if (infer_type == infer_kind::default_kind) { - result *= raft_proto::ceildiv(threads_per_block, rows_per_block_iteration); + result *= ceildiv(threads_per_block, rows_per_block_iteration); } return result; } @@ -77,12 +77,12 @@ inline auto compute_output_size(index_type row_output_size, * value depends on hardware, model, and batch size. Valid values are any power * of 2 from 1 to 32. */ -template -std::enable_if_t infer( +std::enable_if_t infer( forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -94,8 +94,8 @@ std::enable_if_t infer( categorical_data_t categorical_data = nullptr, infer_kind infer_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt, - raft_proto::device_id device = raft_proto::device_id{}, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + device_id device = device_id{}, + cuda_stream stream = cuda_stream{}) { using output_t = typename forest_t::template raw_output_type; @@ -114,15 +114,15 @@ std::enable_if_t infer( // block. auto threads_per_block = min(MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size( - (max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes, WARP_SIZE)); + downpadded_size((max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes, + WARP_SIZE)); // If we cannot do at least a warp per block when storing input rows in // shared mem, recalculate our threads per block without input storage if (threads_per_block < WARP_SIZE) { threads_per_block = min(MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size(max_shared_mem_per_block / row_output_size_bytes, WARP_SIZE)); + downpadded_size(max_shared_mem_per_block / row_output_size_bytes, WARP_SIZE)); if (threads_per_block >= WARP_SIZE) { row_size_bytes = index_type{}; // Do not store input rows in shared mem } @@ -131,9 +131,8 @@ std::enable_if_t infer( // If we cannot do at least a warp per block when storing output in // shared mem, recalculate our threads per block with ONLY input storage if (threads_per_block < WARP_SIZE) { - threads_per_block = - min(MAX_THREADS_PER_BLOCK, - raft_proto::downpadded_size(max_shared_mem_per_block / row_size_bytes, WARP_SIZE)); + threads_per_block = min(MAX_THREADS_PER_BLOCK, + downpadded_size(max_shared_mem_per_block / row_size_bytes, WARP_SIZE)); } // If we still cannot use at least a warp per block, give up on using @@ -149,7 +148,7 @@ std::enable_if_t infer( auto output_workspace_size = compute_output_size(row_output_size, threads_per_block, rows_per_block_iteration, infer_type); auto output_workspace_size_bytes = output_item_bytes * output_workspace_size; - auto global_workspace = raft_proto::buffer{}; + auto global_workspace = buffer{}; if (output_workspace_size_bytes > max_shared_mem_per_block) { output_workspace_size_bytes = 0; @@ -160,7 +159,7 @@ std::enable_if_t infer( max_overall_shared_mem); auto resident_blocks_per_sm = - min(raft_proto::ceildiv(max_shared_mem_per_sm, shared_mem_per_block), max_resident_blocks); + min(ceildiv(max_shared_mem_per_sm, shared_mem_per_block), max_resident_blocks); // If caller has not specified the number of rows per block iteration, apply // the following heuristic to identify an approximately optimal value @@ -188,10 +187,10 @@ std::enable_if_t infer( shared_mem_per_block = std::min( max_overall_shared_mem, max_shared_mem_per_sm / (max_shared_mem_per_sm / shared_mem_per_block)); - auto num_blocks = std::min(raft_proto::ceildiv(row_count, rows_per_block_iteration), MAX_BLOCKS); + auto num_blocks = std::min(ceildiv(row_count, rows_per_block_iteration), MAX_BLOCKS); if (row_output_size == 0) { - global_workspace = raft_proto::buffer{ - output_workspace_size * num_blocks, raft_proto::device_type::gpu, device.value(), stream}; + global_workspace = buffer{ + output_workspace_size * num_blocks, device_type::gpu, device.value(), stream}; } /** @@ -209,8 +208,8 @@ std::enable_if_t infer( auto num_grove = [infer_type, threads_per_block, task_count, chunk_size]() { auto result = std::uint64_t{1}; if (infer_type == infer_kind::default_kind) { - result = raft_proto::ceildiv(min(static_cast(threads_per_block), task_count), - chunk_size); + result = + ceildiv(min(static_cast(threads_per_block), task_count), chunk_size); } return result; }(); @@ -318,7 +317,7 @@ std::enable_if_t infer( infer_type, global_workspace.data()); } - raft_proto::cuda_check(cudaGetLastError()); + cuda_check(cudaGetLastError()); } /* This macro is invoked here to declare all standard specializations of this @@ -326,13 +325,13 @@ std::enable_if_t infer( * compiled as few times as possible. A macro is used because ever * specialization must be explicitly declared. The final argument to the macro * references the 8 specialization variants compiled in nvForest. */ -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 0) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 1) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 2) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 3) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 4) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 5) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 6) -NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 7) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 0) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 1) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 2) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 3) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 4) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 5) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 6) +NVFOREST_INFER_ALL(extern template, device_type::gpu, 7) } // namespace nvforest::detail::inference diff --git a/cpp/include/nvforest/detail/infer/gpu.hpp b/cpp/include/nvforest/detail/infer/gpu.hpp index b68fdf4..5de10d4 100644 --- a/cpp/include/nvforest/detail/infer/gpu.hpp +++ b/cpp/include/nvforest/detail/infer/gpu.hpp @@ -2,12 +2,12 @@ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ +#include +#include #include #include #include -#include -#include -#include +#include #include #include @@ -16,12 +16,12 @@ namespace nvforest::detail::inference { /* The CUDA-free header declaration of the GPU infer template */ -template -std::enable_if_t infer( +std::enable_if_t infer( forest_t const& forest, postprocessor const& postproc, typename forest_t::io_type* output, @@ -33,7 +33,7 @@ std::enable_if_t infer( categorical_data_t categorical_data = nullptr, infer_kind infer_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt, - raft_proto::device_id device = raft_proto::device_id{}, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}); + device_id device = device_id{}, + cuda_stream stream = cuda_stream{}); } // namespace nvforest::detail::inference diff --git a/cpp/include/nvforest/detail/infer_kernel/cpu.hpp b/cpp/include/nvforest/detail/infer_kernel/cpu.hpp index b3db750..5c82d26 100644 --- a/cpp/include/nvforest/detail/infer_kernel/cpu.hpp +++ b/cpp/include/nvforest/detail/infer_kernel/cpu.hpp @@ -3,11 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include #include #include #include -#include #include #include @@ -94,8 +94,8 @@ void infer_kernel_cpu(forest_t const& forest, using output_t = typename forest_t::template raw_output_type; auto const num_tree = forest.tree_count(); - auto const num_grove = raft_proto::ceildiv(num_tree, grove_size); - auto const num_chunk = raft_proto::ceildiv(row_count, chunk_size); + auto const num_grove = ceildiv(num_tree, grove_size); + auto const num_chunk = ceildiv(row_count, chunk_size); /** * Throw an error for large inputs that would cause integer overflow. diff --git a/cpp/include/nvforest/detail/infer_kernel/gpu.cuh b/cpp/include/nvforest/detail/infer_kernel/gpu.cuh index 2e11809..b221293 100644 --- a/cpp/include/nvforest/detail/infer_kernel/gpu.cuh +++ b/cpp/include/nvforest/detail/infer_kernel/gpu.cuh @@ -3,13 +3,13 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include #include #include #include +#include #include -#include -#include #include #include @@ -109,7 +109,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) auto task_count = chunk_size * forest.tree_count(); - auto num_grove = raft_proto::ceildiv(min(index_type(blockDim.x), task_count), chunk_size) * + auto num_grove = ceildiv(min(index_type(blockDim.x), task_count), chunk_size) * (infer_type == infer_kind::default_kind) + (infer_type != infer_kind::default_kind); @@ -121,7 +121,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) // deadlock on __syncthreads, so we round the task_count up to the next // multiple of the number of threads in this block. We then only perform // work within the loop if the task_index is below the actual task_count. - auto const task_count_rounded_up = blockDim.x * raft_proto::ceildiv(task_count, blockDim.x); + auto const task_count_rounded_up = blockDim.x * ceildiv(task_count, blockDim.x); // Infer on each tree and row for (auto task_index = threadIdx.x; task_index < task_count_rounded_up; @@ -175,7 +175,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) __syncthreads(); } - auto padded_num_groves = raft_proto::padded_size(num_grove, WARP_SIZE); + auto padded_num_groves = padded_size(num_grove, WARP_SIZE); for (auto row_index = threadIdx.x / WARP_SIZE; row_index < rows_in_this_iteration; row_index += blockDim.x / WARP_SIZE) { for (auto class_index = index_type{}; class_index < num_outputs; ++class_index) { diff --git a/cpp/include/nvforest/detail/node.hpp b/cpp/include/nvforest/detail/node.hpp index 5cdf73f..f3b97d5 100644 --- a/cpp/include/nvforest/detail/node.hpp +++ b/cpp/include/nvforest/detail/node.hpp @@ -4,8 +4,8 @@ */ #pragma once +#include #include -#include #include #include diff --git a/cpp/include/nvforest/detail/non_owning_buffer.hpp b/cpp/include/nvforest/detail/non_owning_buffer.hpp new file mode 100644 index 0000000..f552e35 --- /dev/null +++ b/cpp/include/nvforest/detail/non_owning_buffer.hpp @@ -0,0 +1,7 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#include diff --git a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp b/cpp/include/nvforest/detail/non_owning_buffer/base.hpp similarity index 82% rename from cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp rename to cpp/include/nvforest/detail/non_owning_buffer/base.hpp index ae1ce76..f851d7c 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp +++ b/cpp/include/nvforest/detail/non_owning_buffer/base.hpp @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include #include #include -namespace raft_proto::detail { +namespace nvforest::detail { template struct non_owning_buffer { // TODO(wphicks): Assess need for buffers of const T @@ -23,4 +23,4 @@ struct non_owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory T* data_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/owning_buffer.hpp b/cpp/include/nvforest/detail/owning_buffer.hpp new file mode 100644 index 0000000..90f1d13 --- /dev/null +++ b/cpp/include/nvforest/detail/owning_buffer.hpp @@ -0,0 +1,10 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +#include +#include +#ifdef NVFOREST_ENABLE_GPU +#include +#endif diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp b/cpp/include/nvforest/detail/owning_buffer/base.hpp similarity index 62% rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp rename to cpp/include/nvforest/detail/owning_buffer/base.hpp index b4d02d1..aa5f24b 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp +++ b/cpp/include/nvforest/detail/owning_buffer/base.hpp @@ -3,13 +3,13 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include -namespace raft_proto::detail { +namespace nvforest::detail { template struct owning_buffer { @@ -18,4 +18,4 @@ struct owning_buffer { auto* get() const { return static_cast(nullptr); } }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp b/cpp/include/nvforest/detail/owning_buffer/cpu.hpp similarity index 72% rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp rename to cpp/include/nvforest/detail/owning_buffer/cpu.hpp index d7be3e3..cc54462 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp +++ b/cpp/include/nvforest/detail/owning_buffer/cpu.hpp @@ -3,14 +3,14 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include +#include +#include +#include #include #include -namespace raft_proto::detail { +namespace nvforest::detail { template struct owning_buffer { // TODO(wphicks): Assess need for buffers of const T @@ -26,4 +26,4 @@ struct owning_buffer { // TODO(wphicks): Back this with RMM-allocated host memory std::unique_ptr data_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp b/cpp/include/nvforest/detail/owning_buffer/gpu.hpp similarity index 75% rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp rename to cpp/include/nvforest/detail/owning_buffer/gpu.hpp index 01bc57a..8419452 100644 --- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp +++ b/cpp/include/nvforest/detail/owning_buffer/gpu.hpp @@ -3,10 +3,10 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include -#include -#include +#include +#include +#include +#include #include @@ -14,7 +14,7 @@ #include -namespace raft_proto::detail { +namespace nvforest::detail { template struct owning_buffer { // TODO(wphicks): Assess need for buffers of const T @@ -36,4 +36,4 @@ struct owning_buffer { private: mutable rmm::device_buffer data_; }; -} // namespace raft_proto::detail +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/raft_proto/padding.hpp b/cpp/include/nvforest/detail/padding.hpp similarity index 91% rename from cpp/include/nvforest/detail/raft_proto/padding.hpp rename to cpp/include/nvforest/detail/padding.hpp index 2ceb32f..8419cfc 100644 --- a/cpp/include/nvforest/detail/raft_proto/padding.hpp +++ b/cpp/include/nvforest/detail/padding.hpp @@ -3,9 +3,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include -namespace raft_proto { +namespace nvforest::detail { /* Return the value that must be added to val to equal the next multiple of * alignment greater than or equal to val */ @@ -45,4 +45,4 @@ HOST DEVICE auto downpadded_size(T val, U alignment) return val - downpadding_size(val, alignment); } -} // namespace raft_proto +} // namespace nvforest::detail diff --git a/cpp/include/nvforest/detail/postprocessor.hpp b/cpp/include/nvforest/detail/postprocessor.hpp index cdf24ca..9b7cb84 100644 --- a/cpp/include/nvforest/detail/postprocessor.hpp +++ b/cpp/include/nvforest/detail/postprocessor.hpp @@ -4,8 +4,8 @@ */ #pragma once +#include #include -#include #include #include diff --git a/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp b/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp deleted file mode 100644 index ab294dc..0000000 --- a/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include -#ifdef NVFOREST_ENABLE_GPU -#include -#endif -#include -#include - -namespace raft_proto { -template -void cuda_check(error_t const& err) noexcept(!GPU_ENABLED) -{ - detail::cuda_check(err); -} -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp b/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp deleted file mode 100644 index 6d61711..0000000 --- a/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include - -namespace raft_proto { -template -using const_agnostic_same_t = - std::enable_if_t, std::remove_const_t>, V>; - -template -inline constexpr auto const_agnostic_same_v = - std::is_same_v, std::remove_const_t>; -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp b/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp deleted file mode 100644 index 3524cc4..0000000 --- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp +++ /dev/null @@ -1,13 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include -#include -#include - -namespace raft_proto { -template -using host_only_throw = detail::host_only_throw; -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp b/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp deleted file mode 100644 index 16d3d7c..0000000 --- a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp +++ /dev/null @@ -1,12 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include -#include - -namespace raft_proto { -template -using non_owning_buffer = detail::non_owning_buffer; -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp b/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp deleted file mode 100644 index a8e0797..0000000 --- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp +++ /dev/null @@ -1,14 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include -#include -#ifdef NVFOREST_ENABLE_GPU -#include -#endif -namespace raft_proto { -template -using owning_buffer = detail::owning_buffer; -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/device_id.hpp b/cpp/include/nvforest/detail/raft_proto/device_id.hpp deleted file mode 100644 index 0f6b1d9..0000000 --- a/cpp/include/nvforest/detail/raft_proto/device_id.hpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once - -#include -#include -#ifdef NVFOREST_ENABLE_GPU -#include -#endif -#include - -#include - -namespace raft_proto { -template -using device_id = detail::device_id; - -using device_id_variant = std::variant, device_id>; -} // namespace raft_proto diff --git a/cpp/include/nvforest/detail/raft_proto/device_setter.hpp b/cpp/include/nvforest/detail/raft_proto/device_setter.hpp deleted file mode 100644 index fd729ff..0000000 --- a/cpp/include/nvforest/detail/raft_proto/device_setter.hpp +++ /dev/null @@ -1,16 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once -#include -#ifdef NVFOREST_ENABLE_GPU -#include -#endif -#include - -namespace raft_proto { - -using device_setter = detail::device_setter; - -} diff --git a/cpp/include/nvforest/detail/raft_proto/device_type.hpp b/cpp/include/nvforest/detail/raft_proto/device_type.hpp deleted file mode 100644 index 730ce48..0000000 --- a/cpp/include/nvforest/detail/raft_proto/device_type.hpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2023-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -namespace raft_proto { -enum class device_type { cpu, gpu }; -} diff --git a/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp b/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp index 578d44e..800de94 100644 --- a/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp +++ b/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp @@ -3,12 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include -#include +#include #include +#include /* Declare device initialization function for the types specified by the given * variant index */ -#define NVFOREST_INITIALIZE_DEVICE(template_type, variant_index) \ - template_type void \ - initialize_device( \ - raft_proto::device_id); +#define NVFOREST_INITIALIZE_DEVICE(template_type, variant_index) \ + template_type void initialize_device( \ + device_id); diff --git a/cpp/include/nvforest/detail/specializations/infer_macros.hpp b/cpp/include/nvforest/detail/specializations/infer_macros.hpp index 1fba70f..7e18c9c 100644 --- a/cpp/include/nvforest/detail/specializations/infer_macros.hpp +++ b/cpp/include/nvforest/detail/specializations/infer_macros.hpp @@ -4,14 +4,14 @@ */ #pragma once #include +#include +#include #include #include #include -#include -#include -#include #include #include +#include #include #include @@ -31,8 +31,8 @@ std::nullptr_t, \ infer_kind, \ std::optional, \ - raft_proto::device_id, \ - raft_proto::cuda_stream stream) + device_id, \ + cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model with vector leaves but without non-local categorical data.*/ @@ -48,8 +48,8 @@ std::nullptr_t, \ infer_kind, \ std::optional, \ - raft_proto::device_id, \ - raft_proto::cuda_stream stream) + device_id, \ + cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model without vector leaves but with non-local categorical data.*/ @@ -65,8 +65,8 @@ NVFOREST_SPEC(variant_index)::index_type*, \ infer_kind, \ std::optional, \ - raft_proto::device_id, \ - raft_proto::cuda_stream stream) + device_id, \ + cuda_stream stream) /* Macro which expands to the valid arguments to an inference call for a forest * model with vector leaves and with non-local categorical data.*/ @@ -82,8 +82,8 @@ NVFOREST_SPEC(variant_index)::index_type*, \ infer_kind, \ std::optional, \ - raft_proto::device_id, \ - raft_proto::cuda_stream stream) + device_id, \ + cuda_stream stream) /* Macro which expands to the declaration of an inference template for a forest * of the type indicated by the variant index */ diff --git a/cpp/include/nvforest/device_type.hpp b/cpp/include/nvforest/device_type.hpp new file mode 100644 index 0000000..1478d6e --- /dev/null +++ b/cpp/include/nvforest/device_type.hpp @@ -0,0 +1,8 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once +namespace nvforest { +enum class device_type { cpu, gpu }; +} // namespace nvforest diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp index 9c07f13..d3fc7b2 100644 --- a/cpp/include/nvforest/forest_model.hpp +++ b/cpp/include/nvforest/forest_model.hpp @@ -3,12 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include +#include #include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include +#include #include #ifdef NVFOREST_ENABLE_GPU @@ -116,7 +120,7 @@ struct forest_model { * @param[out] output The buffer where model output should be stored. * This must be of size at least ROWS x num_outputs(). * @param[in] input The buffer containing input data. - * @param[in] stream A raft_proto::cuda_stream, which (on GPU-enabled builds) is + * @param[in] stream A nvforest::cuda_stream, which (on GPU-enabled builds) is * a transparent wrapper for the cudaStream_t or (on CPU-only builds) a * CUDA-free placeholder object. * @param[in] predict_type Type of inference to perform. Defaults to summing @@ -134,9 +138,9 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft_proto::buffer& output, - raft_proto::buffer const& input, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}, + void predict(buffer& output, + buffer const& input, + cuda_stream stream = cuda_stream{}, infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { @@ -157,7 +161,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t + * @param[in] handle The nvforest::handle_t (wrapper for raft::handle_t * on GPU) which will be used to provide streams for evaluation. * @param[out] output The buffer where model output should be stored. If * this buffer is on host while the model is on device or vice versa, @@ -182,9 +186,9 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft_proto::handle_t const& handle, - raft_proto::buffer& output, - raft_proto::buffer const& input, + void predict(handle_t const& handle, + buffer& output, + buffer const& input, infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { @@ -202,48 +206,44 @@ struct forest_model { auto row_count = input.size() / num_features(); auto partition_size = - std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()), + std::max(detail::ceildiv(row_count, handle.get_usable_stream_count()), specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION); - auto partition_count = raft_proto::ceildiv(row_count, partition_size); + auto partition_count = detail::ceildiv(row_count, partition_size); for (auto i = std::size_t{}; i < partition_count; ++i) { auto stream = handle.get_next_usable_stream(); auto rows_in_this_partition = std::min(partition_size, row_count - i * partition_size); - auto partition_in = raft_proto::buffer{}; + auto partition_in = buffer{}; if (input.memory_type() != memory_type()) { - partition_in = - raft_proto::buffer{rows_in_this_partition * num_features(), memory_type()}; - raft_proto::copy(partition_in, - input, - 0, - i * partition_size * num_features(), - partition_in.size(), - stream); + partition_in = buffer{rows_in_this_partition * num_features(), memory_type()}; + copy(partition_in, + input, + 0, + i * partition_size * num_features(), + partition_in.size(), + stream); } else { - partition_in = - raft_proto::buffer{input.data() + i * partition_size * num_features(), - rows_in_this_partition * num_features(), - memory_type()}; + partition_in = buffer{input.data() + i * partition_size * num_features(), + rows_in_this_partition * num_features(), + memory_type()}; } - auto partition_out = raft_proto::buffer{}; + auto partition_out = buffer{}; if (output.memory_type() != memory_type()) { - partition_out = - raft_proto::buffer{rows_in_this_partition * num_outputs(), memory_type()}; + partition_out = buffer{rows_in_this_partition * num_outputs(), memory_type()}; } else { - partition_out = - raft_proto::buffer{output.data() + i * partition_size * num_outputs(), - rows_in_this_partition * num_outputs(), - memory_type()}; + partition_out = buffer{output.data() + i * partition_size * num_outputs(), + rows_in_this_partition * num_outputs(), + memory_type()}; } concrete_forest.predict( partition_out, partition_in, stream, predict_type, specified_chunk_size); if (output.memory_type() != memory_type()) { - raft_proto::copy(output, - partition_out, - i * partition_size * num_outputs(), - 0, - partition_out.size(), - stream); + copy(output, + partition_out, + i * partition_size * num_outputs(), + 0, + partition_out.size(), + stream); } } } @@ -257,7 +257,7 @@ struct forest_model { /** * Perform inference on given input * - * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t + * @param[in] handle The nvforest::handle_t (wrapper for raft::handle_t * on GPU) which will be used to provide streams for evaluation. * @param[out] output Pointer to the memory location where output should end * up @@ -281,30 +281,27 @@ struct forest_model { * reasonable value. On CPU, this argument can generally just be omitted. */ template - void predict(raft_proto::handle_t const& handle, + void predict(handle_t const& handle, io_t* output, io_t* input, std::size_t num_rows, - raft_proto::device_type out_mem_type, - raft_proto::device_type in_mem_type, + device_type out_mem_type, + device_type in_mem_type, infer_kind predict_type = infer_kind::default_kind, std::optional specified_chunk_size = std::nullopt) { int current_device_id; - if (out_mem_type == raft_proto::device_type::gpu || - in_mem_type == raft_proto::device_type::gpu) { + if (out_mem_type == device_type::gpu || in_mem_type == device_type::gpu) { #ifdef NVFOREST_ENABLE_GPU - raft_proto::cuda_check(cudaGetDevice(¤t_device_id)); + detail::cuda_check(cudaGetDevice(¤t_device_id)); #else - throw raft_proto::gpu_unsupported("Tried to use GPU memory in CPU-only build"); + throw detail::gpu_unsupported("Tried to use GPU memory in CPU-only build"); #endif } else { current_device_id = -1; } - auto out_buffer = - raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id}; - auto in_buffer = - raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id}; + auto out_buffer = buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id}; + auto in_buffer = buffer{input, num_rows * num_features(), in_mem_type, current_device_id}; predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size); } diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/handle.hpp similarity index 80% rename from cpp/include/nvforest/detail/raft_proto/handle.hpp rename to cpp/include/nvforest/handle.hpp index 086d61b..861f09b 100644 --- a/cpp/include/nvforest/detail/raft_proto/handle.hpp +++ b/cpp/include/nvforest/handle.hpp @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include +#include #include #include @@ -11,14 +11,14 @@ #include #endif -namespace raft_proto { +namespace nvforest { #ifdef NVFOREST_ENABLE_GPU struct handle_t { handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {} handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {} auto get_next_usable_stream() const { - return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()}; + return cuda_stream{raft_handle_->get_next_usable_stream().value()}; } auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); } auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } @@ -34,10 +34,10 @@ struct handle_t { }; #else struct handle_t { - auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; } + auto get_next_usable_stream() const { return cuda_stream{}; } auto get_stream_pool_size() const { return std::size_t{}; } auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); } void synchronize() const {} }; #endif -} // namespace raft_proto +} // namespace nvforest diff --git a/cpp/include/nvforest/treelite_importer.hpp b/cpp/include/nvforest/treelite_importer.hpp index de4d06b..927f8e5 100644 --- a/cpp/include/nvforest/treelite_importer.hpp +++ b/cpp/include/nvforest/treelite_importer.hpp @@ -4,11 +4,13 @@ */ #pragma once #include +#include #include #include #include #include #include +#include #include #include #include @@ -245,10 +247,10 @@ struct treelite_importer { index_type num_feature, index_type max_num_categories, std::vector const& offsets, - index_type align_bytes = index_type{}, - raft_proto::device_type mem_type = raft_proto::device_type::cpu, - int device = 0, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + index_type align_bytes = index_type{}, + nvforest::device_type mem_type = nvforest::device_type::cpu, + int device = 0, + nvforest::cuda_stream stream = nvforest::cuda_stream{}) { auto result = decision_forest_variant{}; if constexpr (variant_index != std::variant_size_v) { @@ -362,9 +364,9 @@ struct treelite_importer { forest_model import(treelite::Model const& tl_model, index_type align_bytes = index_type{}, std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type = raft_proto::device_type::cpu, + nvforest::device_type dev_type = nvforest::device_type::cpu, int device = 0, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) + nvforest::cuda_stream stream = nvforest::cuda_stream{}) { validate_model_shape(tl_model); @@ -460,14 +462,13 @@ struct treelite_importer { * @param stream The CUDA stream to use for loading this model (can be * omitted for CPU). */ -inline auto import_from_treelite_model( - treelite::Model const& tl_model, - tree_layout layout = preferred_tree_layout, - index_type align_bytes = index_type{}, - std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type = raft_proto::device_type::cpu, - int device = 0, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +inline auto import_from_treelite_model(treelite::Model const& tl_model, + tree_layout layout = preferred_tree_layout, + index_type align_bytes = index_type{}, + std::optional use_double_precision = std::nullopt, + nvforest::device_type dev_type = nvforest::device_type::cpu, + int device = 0, + nvforest::cuda_stream stream = nvforest::cuda_stream{}) { auto result = forest_model{}; switch (layout) { @@ -511,14 +512,13 @@ inline auto import_from_treelite_model( * @param stream The CUDA stream to use for loading this model (can be * omitted for CPU). */ -inline auto import_from_treelite_handle( - TreeliteModelHandle tl_handle, - tree_layout layout = preferred_tree_layout, - index_type align_bytes = index_type{}, - std::optional use_double_precision = std::nullopt, - raft_proto::device_type dev_type = raft_proto::device_type::cpu, - int device = 0, - raft_proto::cuda_stream stream = raft_proto::cuda_stream{}) +inline auto import_from_treelite_handle(TreeliteModelHandle tl_handle, + tree_layout layout = preferred_tree_layout, + index_type align_bytes = index_type{}, + std::optional use_double_precision = std::nullopt, + nvforest::device_type dev_type = nvforest::device_type::cpu, + int device = 0, + nvforest::cuda_stream stream = nvforest::cuda_stream{}) { return import_from_treelite_model(*static_cast(tl_handle), layout, diff --git a/cpp/src/infer0.cpp b/cpp/src/infer0.cpp index e77ba2f..81d59c3 100644 --- a/cpp/src/infer0.cpp +++ b/cpp/src/infer0.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 0) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 0) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer0.cu b/cpp/src/infer0.cu index 16f8f35..ff0b117 100644 --- a/cpp/src/infer0.cu +++ b/cpp/src/infer0.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 0) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 0) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 0) diff --git a/cpp/src/infer1.cpp b/cpp/src/infer1.cpp index a5c1be9..cb83347 100644 --- a/cpp/src/infer1.cpp +++ b/cpp/src/infer1.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 1) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 1) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer1.cu b/cpp/src/infer1.cu index bd24cb8..43ebbcb 100644 --- a/cpp/src/infer1.cu +++ b/cpp/src/infer1.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 1) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 1) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 1) diff --git a/cpp/src/infer10.cpp b/cpp/src/infer10.cpp index 1f3b000..d7ee644 100644 --- a/cpp/src/infer10.cpp +++ b/cpp/src/infer10.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 10) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 10) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer10.cu b/cpp/src/infer10.cu index fae886d..f91864a 100644 --- a/cpp/src/infer10.cu +++ b/cpp/src/infer10.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 10) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 10) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 10) diff --git a/cpp/src/infer11.cpp b/cpp/src/infer11.cpp index 88df365..8cfcbae 100644 --- a/cpp/src/infer11.cpp +++ b/cpp/src/infer11.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 11) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 11) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer11.cu b/cpp/src/infer11.cu index e74034d..22450a2 100644 --- a/cpp/src/infer11.cu +++ b/cpp/src/infer11.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 11) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 11) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 11) diff --git a/cpp/src/infer2.cpp b/cpp/src/infer2.cpp index 2b44e54..bd47291 100644 --- a/cpp/src/infer2.cpp +++ b/cpp/src/infer2.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 2) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 2) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer2.cu b/cpp/src/infer2.cu index e232010..acbcda0 100644 --- a/cpp/src/infer2.cu +++ b/cpp/src/infer2.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 2) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 2) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 2) diff --git a/cpp/src/infer3.cpp b/cpp/src/infer3.cpp index 958dde8..59c0130 100644 --- a/cpp/src/infer3.cpp +++ b/cpp/src/infer3.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 3) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 3) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer3.cu b/cpp/src/infer3.cu index d70481e..9f3e49e 100644 --- a/cpp/src/infer3.cu +++ b/cpp/src/infer3.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 3) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 3) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 3) diff --git a/cpp/src/infer4.cpp b/cpp/src/infer4.cpp index 88a15c2..d8f500e 100644 --- a/cpp/src/infer4.cpp +++ b/cpp/src/infer4.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 4) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 4) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer4.cu b/cpp/src/infer4.cu index 7c0e985..74509a4 100644 --- a/cpp/src/infer4.cu +++ b/cpp/src/infer4.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 4) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 4) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 4) diff --git a/cpp/src/infer5.cpp b/cpp/src/infer5.cpp index 09105dc..fed7c30 100644 --- a/cpp/src/infer5.cpp +++ b/cpp/src/infer5.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 5) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 5) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer5.cu b/cpp/src/infer5.cu index e243630..c098803 100644 --- a/cpp/src/infer5.cu +++ b/cpp/src/infer5.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 5) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 5) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 5) diff --git a/cpp/src/infer6.cpp b/cpp/src/infer6.cpp index 5bebe27..4ad13b5 100644 --- a/cpp/src/infer6.cpp +++ b/cpp/src/infer6.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 6) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 6) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer6.cu b/cpp/src/infer6.cu index b5face9..8d1d1db 100644 --- a/cpp/src/infer6.cu +++ b/cpp/src/infer6.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 6) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 6) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 6) diff --git a/cpp/src/infer7.cpp b/cpp/src/infer7.cpp index e8f3e5b..8bfd2d9 100644 --- a/cpp/src/infer7.cpp +++ b/cpp/src/infer7.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 7) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 7) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer7.cu b/cpp/src/infer7.cu index 4104c82..c722005 100644 --- a/cpp/src/infer7.cu +++ b/cpp/src/infer7.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 7) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 7) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 7) diff --git a/cpp/src/infer8.cpp b/cpp/src/infer8.cpp index 3597baf..c190556 100644 --- a/cpp/src/infer8.cpp +++ b/cpp/src/infer8.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 8) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 8) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer8.cu b/cpp/src/infer8.cu index 2278699..bad5de4 100644 --- a/cpp/src/infer8.cu +++ b/cpp/src/infer8.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 8) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 8) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 8) diff --git a/cpp/src/infer9.cpp b/cpp/src/infer9.cpp index 9f84457..7bd9234 100644 --- a/cpp/src/infer9.cpp +++ b/cpp/src/infer9.cpp @@ -5,5 +5,5 @@ #include #include namespace nvforest::detail::inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 9) +NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 9) } // namespace nvforest::detail::inference diff --git a/cpp/src/infer9.cu b/cpp/src/infer9.cu index 052b5db..4a26bdf 100644 --- a/cpp/src/infer9.cu +++ b/cpp/src/infer9.cu @@ -8,7 +8,7 @@ #include namespace nvforest::detail { namespace inference { -NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 9) +NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 9) } // namespace inference namespace device_initialization { NVFOREST_INITIALIZE_DEVICE(template, 9) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4430c46..0595f8a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -80,8 +80,8 @@ endfunction() # ###################################################################################################################### # * Build ml_test executable ------------------------------------------------- -ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp) -ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu) +ConfigureTest(NAME HOST_BUFFER_TEST buffer/buffer.cpp) +ConfigureTest(NAME DEVICE_BUFFER_TEST buffer/buffer.cu) ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp) ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp) ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp treelite_importer_invalid_inputs.cpp diff --git a/cpp/tests/raft_proto/buffer.cpp b/cpp/tests/buffer/buffer.cpp similarity index 92% rename from cpp/tests/raft_proto/buffer.cpp rename to cpp/tests/buffer/buffer.cpp index 6f6e940..7bf264b 100644 --- a/cpp/tests/raft_proto/buffer.cpp +++ b/cpp/tests/buffer/buffer.cpp @@ -3,16 +3,16 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include -namespace raft_proto { +namespace nvforest { TEST(Buffer, default_buffer) { @@ -202,10 +202,10 @@ TEST(Buffer, copy_buffer) test_dev_buffers.emplace_back(orig_buffer, device_type::gpu, 0, cuda_stream{}); for (auto& dev_buf : test_dev_buffers) { data_out = std::vector(data.size()); - cuda_check(cudaMemcpy(static_cast(data_out.data()), - static_cast(dev_buf.data()), - dev_buf.size() * sizeof(int), - cudaMemcpyDefault)); + detail::cuda_check(cudaMemcpy(static_cast(data_out.data()), + static_cast(dev_buf.data()), + dev_buf.size() * sizeof(int), + cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); auto test_dev_copies = std::vector>{}; @@ -214,10 +214,10 @@ TEST(Buffer, copy_buffer) test_dev_copies.emplace_back(dev_buf, device_type::gpu, 0, cuda_stream{}); for (auto& copy_buf : test_dev_copies) { data_out = std::vector(data.size()); - cuda_check(cudaMemcpy(static_cast(data_out.data()), - static_cast(copy_buf.data()), - copy_buf.size() * sizeof(int), - cudaMemcpyDefault)); + detail::cuda_check(cudaMemcpy(static_cast(data_out.data()), + static_cast(copy_buf.data()), + copy_buf.size() * sizeof(int), + cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } @@ -268,10 +268,10 @@ TEST(Buffer, move_buffer) ASSERT_NE(buf.data(), data.data()); auto data_out = std::vector(buf.size()); - cuda_check(cudaMemcpy(static_cast(data_out.data()), - static_cast(buf.data()), - buf.size() * sizeof(int), - cudaMemcpyDefault)); + detail::cuda_check(cudaMemcpy(static_cast(data_out.data()), + static_cast(buf.data()), + buf.size() * sizeof(int), + cudaMemcpyDefault)); EXPECT_THAT(data_out, ::testing::ElementsAreArray(data)); } #endif @@ -306,7 +306,7 @@ TEST(Buffer, partial_buffer_copy) auto buf2 = buffer{data2.data(), data2.size(), device_type::cpu}; copy(buf2, buf1, 1, 2, 3, cuda_stream{}); copy(buf2, buf1, 1, 2, 3, cuda_stream{}); - EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, cuda_stream{}), out_of_bounds); + EXPECT_THROW(copy(buf2, buf1, 1, 2, 4, cuda_stream{}), detail::out_of_bounds); } TEST(Buffer, buffer_copy_overloads) @@ -360,4 +360,4 @@ TEST(Buffer, buffer_copy_overloads) #endif } -} // namespace raft_proto +} // namespace nvforest diff --git a/cpp/tests/raft_proto/buffer.cu b/cpp/tests/buffer/buffer.cu similarity index 79% rename from cpp/tests/raft_proto/buffer.cu rename to cpp/tests/buffer/buffer.cu index 001b5f2..a114724 100644 --- a/cpp/tests/raft_proto/buffer.cu +++ b/cpp/tests/buffer/buffer.cu @@ -3,11 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include -#include -#include -#include +#include +#include +#include #include +#include #include @@ -16,7 +16,7 @@ #include -namespace raft_proto { +namespace nvforest { NVFOREST_KERNEL void check_buffer_access(int* buf) { @@ -39,4 +39,4 @@ TEST(Buffer, device_buffer_access) EXPECT_THAT(data_out, testing::ElementsAreArray(expected)); } -} // namespace raft_proto +} // namespace nvforest diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp index 528aebb..5a67bf6 100644 --- a/cpp/tests/treelite_importer.cpp +++ b/cpp/tests/treelite_importer.cpp @@ -3,8 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include -#include +#include +#include #include #include #include @@ -255,7 +255,7 @@ TEST(TreeliteImporter, depth_first) ASSERT_FALSE(nvforest_model.has_vector_leaves()); ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu); ASSERT_EQ(nvforest_model.device_index(), -1); ASSERT_FALSE(nvforest_model.is_double_precision()); } @@ -269,7 +269,7 @@ TEST(TreeliteImporter, breadth_first) ASSERT_FALSE(nvforest_model.has_vector_leaves()); ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu); ASSERT_EQ(nvforest_model.device_index(), -1); ASSERT_FALSE(nvforest_model.is_double_precision()); } @@ -284,7 +284,7 @@ TEST(TreeliteImporter, layered_children_together) ASSERT_FALSE(nvforest_model.has_vector_leaves()); ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable); ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable); - ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu); + ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu); ASSERT_EQ(nvforest_model.device_index(), -1); ASSERT_FALSE(nvforest_model.is_double_precision()); } @@ -384,9 +384,9 @@ TEST(TreeliteImporter, DegenerateTree) #ifdef NVFOREST_ENABLE_GPU auto raft_handle = raft::handle_t{}; - auto handle = raft_proto::handle_t{raft_handle}; + auto handle = nvforest::handle_t{raft_handle}; #else - auto handle = raft_proto::handle_t{}; + auto handle = nvforest::handle_t{}; #endif auto X = std::vector{0.0}; auto preds = std::vector(1, 0.0); @@ -395,8 +395,8 @@ TEST(TreeliteImporter, DegenerateTree) preds.data(), X.data(), 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, + nvforest::device_type::cpu, + nvforest::device_type::cpu, nvforest::infer_kind::default_kind, 1); ASSERT_EQ(preds, expected_preds); @@ -410,9 +410,9 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) #ifdef NVFOREST_ENABLE_GPU auto raft_handle = raft::handle_t{}; - auto handle = raft_proto::handle_t{raft_handle}; + auto handle = nvforest::handle_t{raft_handle}; #else - auto handle = raft_proto::handle_t{}; + auto handle = nvforest::handle_t{}; #endif auto X = std::vector{0.0}; auto preds = std::vector(2, 0.0); @@ -421,8 +421,8 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf) preds.data(), X.data(), 1, - raft_proto::device_type::cpu, - raft_proto::device_type::cpu, + nvforest::device_type::cpu, + nvforest::device_type::cpu, nvforest::infer_kind::default_kind, 1); ASSERT_EQ(preds, expected_preds); diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 8500861..2a3519f 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -151,7 +151,7 @@ nvForest provides a CMake config file so that other C++ projects can find and us find_package(nvforest CONFIG REQUIRED) - target_link_libraries(my_target PRIVATE nvforest::nvforest++ treelite::treelite) + target_link_libraries(my_target PRIVATE nvforest::nvforest treelite::treelite) To ensure that CMake can locate nvForest and Treelite, we recommend the use of Conda to install nvForest. @@ -179,9 +179,9 @@ Once the tree model is available as a Treelite object, pass it to the .. code-block:: cpp #include + #include #include #include - #include #include auto fm = nvforest::import_from_treelite_model( @@ -189,22 +189,22 @@ Once the tree model is available as a Treelite object, pass it to the nvforest::preferred_tree_layout, nvforest::index_type{}, std::nullopt, - raft_proto::device_type::gpu); + nvforest::device_type::gpu); Now that the tree model is fully imported into nvForest, let's run inference: .. code-block:: cpp #include - #include + #include raft::handle_t raft_handle{}; - raft_proto::handle_t handle{raft_handle}; + nvforest::handle_t handle{raft_handle}; // Assumption: // * Both output and input are in the GPU memory. // * The input buffer should be of dimension (num_rows, num_features) // * The output buffer should be of dimension (num_rows, fm.num_outputs()) fm.predict(handle, output, input, num_rows, - raft_proto::device_type::gpu, raft_proto::device_type::gpu, + nvforest::device_type::gpu, nvforest::device_type::gpu, nvforest::infer_kind::default_kind); diff --git a/python/libnvforest/CMakeLists.txt b/python/libnvforest/CMakeLists.txt index 37eb6c1..e360602 100644 --- a/python/libnvforest/CMakeLists.txt +++ b/python/libnvforest/CMakeLists.txt @@ -30,7 +30,7 @@ unset(nvforest_FOUND) # --- nvForest --- # set(BUILD_NVFOREST_TESTS OFF) -set(NVFOREST_CPP_TARGET "nvforest++") +set(NVFOREST_CPP_TARGET "nvforest") set(NVFOREST_CPP_SRC "../../cpp") # --- raft --- # diff --git a/python/libnvforest/libnvforest/load.py b/python/libnvforest/libnvforest/load.py index c8fe52e..08f4a77 100644 --- a/python/libnvforest/libnvforest/load.py +++ b/python/libnvforest/libnvforest/load.py @@ -34,7 +34,7 @@ def _load_wheel_installation(soname: str): def load_library(): - """Dynamically load libnvforest++.so and its dependencies""" + """Dynamically load libnvforest.so and its dependencies""" try: # These libraries must all be loaded before libnvforest import libraft @@ -55,7 +55,7 @@ def load_library(): != "false" ) - soname = "libnvforest++.so" + soname = "libnvforest.so" libnvforest_lib = None if prefer_system_installation: # Prefer a system library if one is present to diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt index 24333a1..90dac88 100644 --- a/python/nvforest/CMakeLists.txt +++ b/python/nvforest/CMakeLists.txt @@ -24,7 +24,7 @@ project( # * User Options -------------------------------------------------------------- option(USE_LIBNVFOREST_WHEEL "Use libnvforest wheel to provide some dependencies" OFF) -set(NVFOREST_CPP_TARGET "nvforest++") +set(NVFOREST_CPP_TARGET "nvforest") set(NVFOREST_CPP_SRC "../../cpp") # ###################################################################################################################### diff --git a/python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd b/python/nvforest/nvforest/detail/cuda_stream.pxd similarity index 63% rename from python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd rename to python/nvforest/nvforest/detail/cuda_stream.pxd index 899fed4..d12bf05 100644 --- a/python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd +++ b/python/nvforest/nvforest/detail/cuda_stream.pxd @@ -2,6 +2,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # -cdef extern from "nvforest/detail/raft_proto/cuda_stream.hpp" namespace "raft_proto" nogil: +cdef extern from "nvforest/cuda_stream.hpp" namespace "nvforest" nogil: cdef cppclass cuda_stream: pass diff --git a/python/nvforest/nvforest/detail/device_type.pxd b/python/nvforest/nvforest/detail/device_type.pxd new file mode 100644 index 0000000..ea89f3a --- /dev/null +++ b/python/nvforest/nvforest/detail/device_type.pxd @@ -0,0 +1,8 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# +cdef extern from "nvforest/device_type.hpp" namespace "nvforest" nogil: + cdef enum device_type: + cpu "nvforest::device_type::cpu", + gpu "nvforest::device_type::gpu" diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx index 0340c59..ce15c5b 100644 --- a/python/nvforest/nvforest/detail/forest_inference.pyx +++ b/python/nvforest/nvforest/detail/forest_inference.pyx @@ -14,18 +14,14 @@ from nvforest.detail.treelite import safe_treelite_call from libc.stdint cimport uint32_t, uintptr_t from libcpp cimport bool +from libcpp.optional cimport nullopt, optional from pylibraft.common.handle cimport handle_t as raft_handle_t +from nvforest.detail.cuda_stream cimport cuda_stream as nvforest_stream_t +from nvforest.detail.device_type cimport device_type as nvforest_device_t +from nvforest.detail.handle cimport handle_t as nvforest_handle_t from nvforest.detail.infer_kind cimport infer_kind from nvforest.detail.postprocessing cimport element_op, row_op -from nvforest.detail.raft_proto.cuda_stream cimport ( - cuda_stream as raft_proto_stream_t, -) -from nvforest.detail.raft_proto.device_type cimport ( - device_type as raft_proto_device_t, -) -from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t -from nvforest.detail.raft_proto.optional cimport nullopt, optional from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout from nvforest.detail.treelite cimport ( TreeliteDeserializeModelFromBytes, @@ -37,12 +33,12 @@ from nvforest.detail.treelite cimport ( cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil: cdef cppclass forest_model: void predict[io_t]( - const raft_proto_handle_t&, + const nvforest_handle_t&, io_t*, io_t*, size_t, - raft_proto_device_t, - raft_proto_device_t, + nvforest_device_t, + nvforest_device_t, infer_kind, optional[uint32_t] ) except + @@ -61,15 +57,15 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil: nvforest_tree_layout, uint32_t, optional[bool], - raft_proto_device_t, + nvforest_device_t, int, - raft_proto_stream_t + nvforest_stream_t ) except + cdef class ForestInference_impl(): cdef forest_model model - cdef raft_proto_handle_t raft_proto_handle + cdef nvforest_handle_t nvforest_handle cdef object raft_handle cdef object device @@ -84,10 +80,10 @@ cdef class ForestInference_impl(): device: str = "cpu", device_id: Optional[int] = None, ): - # Store reference to RAFT handle to control lifetime, since raft_proto - # handle keeps a pointer to it + # Store reference to RAFT handle to control lifetime, since + # nvforest_handle keeps a pointer to it self.raft_handle = raft_handle - self.raft_proto_handle = raft_proto_handle_t( + self.nvforest_handle = nvforest_handle_t( self.raft_handle.getHandle() ) @@ -106,8 +102,8 @@ cdef class ForestInference_impl(): "Failed to load Treelite model from bytes:" ) - cdef raft_proto_device_t dev_type - dev_type = raft_proto_device_t.gpu if device == "gpu" else raft_proto_device_t.cpu + cdef nvforest_device_t dev_type + dev_type = nvforest_device_t.gpu if device == "gpu" else nvforest_device_t.cpu cdef nvforest_tree_layout tree_layout if layout.lower() == "depth_first": tree_layout = nvforest_tree_layout.depth_first @@ -134,7 +130,7 @@ cdef class ForestInference_impl(): use_double_precision_c, dev_type, device_id, - self.raft_proto_handle.get_next_usable_stream() + self.nvforest_handle.get_next_usable_stream() ) safe_treelite_call( @@ -188,9 +184,9 @@ cdef class ForestInference_impl(): chunk_size: Optional[int] = None, ) -> DataType: cdef uintptr_t in_ptr - cdef raft_proto_device_t in_dev + cdef nvforest_device_t in_dev cdef uintptr_t out_ptr - cdef raft_proto_device_t out_dev + cdef nvforest_device_t out_dev cdef infer_kind infer_type_enum cdef optional[uint32_t] chunk_specification @@ -220,9 +216,9 @@ cdef class ForestInference_impl(): order="C", ) in_ptr = X.__array_interface__["data"][0] - in_dev = raft_proto_device_t.cpu + in_dev = nvforest_device_t.cpu out_ptr = preds.__array_interface__["data"][0] - out_dev = raft_proto_device_t.cpu + out_dev = nvforest_device_t.cpu else: assert self.device == "gpu" import cupy as cp @@ -233,9 +229,9 @@ cdef class ForestInference_impl(): order="C", ) in_ptr = X.__cuda_array_interface__["data"][0] - in_dev = raft_proto_device_t.gpu + in_dev = nvforest_device_t.gpu out_ptr = preds.__cuda_array_interface__["data"][0] - out_dev = raft_proto_device_t.gpu + out_dev = nvforest_device_t.gpu if chunk_size is None: chunk_specification = nullopt @@ -244,7 +240,7 @@ cdef class ForestInference_impl(): if model_dtype == np.float32: self.model.predict[float]( - self.raft_proto_handle, + self.nvforest_handle, out_ptr, in_ptr, n_rows, @@ -255,7 +251,7 @@ cdef class ForestInference_impl(): ) else: self.model.predict[double]( - self.raft_proto_handle, + self.nvforest_handle, out_ptr, in_ptr, n_rows, @@ -266,7 +262,7 @@ cdef class ForestInference_impl(): ) if self.device == "gpu": - self.raft_proto_handle.synchronize() + self.nvforest_handle.synchronize() return preds diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd similarity index 61% rename from python/nvforest/nvforest/detail/raft_proto/handle.pxd rename to python/nvforest/nvforest/detail/handle.pxd index 5ad107e..41c16a8 100644 --- a/python/nvforest/nvforest/detail/raft_proto/handle.pxd +++ b/python/nvforest/nvforest/detail/handle.pxd @@ -5,15 +5,13 @@ from pylibraft.common.handle cimport handle_t as raft_handle_t -from nvforest.detail.raft_proto.cuda_stream cimport ( - cuda_stream as raft_proto_stream_t, -) +from nvforest.detail.cuda_stream cimport cuda_stream as nvforest_stream_t -cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil: +cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil: cdef cppclass handle_t: handle_t() except + handle_t(const raft_handle_t* handle_ptr) except + handle_t(const raft_handle_t& handle) except + - raft_proto_stream_t get_next_usable_stream() except + + nvforest_stream_t get_next_usable_stream() except + void synchronize() except+ diff --git a/python/nvforest/nvforest/detail/raft_proto/__init__.py b/python/nvforest/nvforest/detail/raft_proto/__init__.py deleted file mode 100644 index b25a9d5..0000000 --- a/python/nvforest/nvforest/detail/raft_proto/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# diff --git a/python/nvforest/nvforest/detail/raft_proto/device_type.pxd b/python/nvforest/nvforest/detail/raft_proto/device_type.pxd deleted file mode 100644 index dc6214c..0000000 --- a/python/nvforest/nvforest/detail/raft_proto/device_type.pxd +++ /dev/null @@ -1,8 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# -cdef extern from "nvforest/detail/raft_proto/device_type.hpp" namespace "raft_proto" nogil: - cdef enum device_type: - cpu "raft_proto::device_type::cpu", - gpu "raft_proto::device_type::gpu" diff --git a/python/nvforest/nvforest/detail/raft_proto/optional.pxd b/python/nvforest/nvforest/detail/raft_proto/optional.pxd deleted file mode 100644 index 54dcac4..0000000 --- a/python/nvforest/nvforest/detail/raft_proto/optional.pxd +++ /dev/null @@ -1,43 +0,0 @@ -# -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 -# -# The following is taken from -# https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/optional.pxd, -# which provides a binding for std::optional in Cython 3.0 - -from libcpp cimport bool - - -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - # T* operator->() # Not Supported - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except +