diff --git a/build.sh b/build.sh
index 541c784..6497744 100755
--- a/build.sh
+++ b/build.sh
@@ -291,9 +291,9 @@ if (! hasArg --configure-only) && (completeBuild || hasArg libnvforest); then
fi
fi
MSG="${MSG}
parallel setting: $PARALLEL_LEVEL"
- if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest++.so" ]]; then
- LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest++.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}')
- MSG="${MSG}
libnvforest++.so size: $LIBNVFOREST_FS"
+ if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest.so" ]]; then
+ LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}')
+ MSG="${MSG}
libnvforest.so size: $LIBNVFOREST_FS"
fi
BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIBNVFOREST_BUILD_DIR}"}
echo "The HTML report can be found at [${BMR_DIR}/ninja_log.html]. In CI, this report"
diff --git a/ci/build_wheel_nvforest.sh b/ci/build_wheel_nvforest.sh
index 7960ce2..ce96a70 100755
--- a/ci/build_wheel_nvforest.sh
+++ b/ci/build_wheel_nvforest.sh
@@ -20,7 +20,7 @@ LIBNVFOREST_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libnvforest_${RAPIDS_PY_CUDA_SUFF
echo "libnvforest-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBNVFOREST_WHEELHOUSE}"/libnvforest_*.whl)" >> "${PIP_CONSTRAINT}"
EXCLUDE_ARGS=(
- --exclude "libnvforest++.so"
+ --exclude "libnvforest.so"
--exclude "libraft.so"
--exclude "libcublas.so.*"
--exclude "libcublasLt.so.*"
diff --git a/conda/recipes/libnvforest/recipe.yaml b/conda/recipes/libnvforest/recipe.yaml
index 3495392..26ca5b9 100644
--- a/conda/recipes/libnvforest/recipe.yaml
+++ b/conda/recipes/libnvforest/recipe.yaml
@@ -91,7 +91,7 @@ outputs:
prefix_detection:
ignore:
# See https://github.com/rapidsai/build-planning/issues/160
- - lib/libnvforest++.so
+ - lib/libnvforest.so
string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }}
requirements:
build:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fb9d015..8dbaa15 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -60,7 +60,7 @@ option(NVTX "Enable nvtx markers" OFF)
option(USE_CCACHE "Cache build artifacts with ccache" OFF)
option(NVFOREST_USE_RAFT_STATIC "Build and statically link the RAFT library" OFF)
option(NVFOREST_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF)
-option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest++" OFF)
+option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest" OFF)
option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
# The options below allow incorporating libnvforest into another build process without installing all its components.
@@ -123,7 +123,7 @@ endif()
# ######################################################################################################################
# * Target names -------------------------------------------------------------
-set(NVFOREST_CPP_TARGET "nvforest++")
+set(NVFOREST_CPP_TARGET "nvforest")
# ######################################################################################################################
# * Conda environment detection ----------------------------------------------
@@ -193,7 +193,7 @@ if(BUILD_NVFOREST_TESTS)
endif()
# ######################################################################################################################
-# * build libnvforest++ shared library -------------------------------------------
+# * build libnvforest shared library -------------------------------------------
file(
WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
diff --git a/cpp/include/nvforest/Implementation.md b/cpp/include/nvforest/Implementation.md
index f34224b..b0bc595 100644
--- a/cpp/include/nvforest/Implementation.md
+++ b/cpp/include/nvforest/Implementation.md
@@ -6,17 +6,6 @@ does *not* require nvcc, CUDA or any other GPU-related library for its CPU-only
build, we also go over general strategies for CPU/GPU interoperability as used
by nvForest.
-**A NOTE ON THE `raft_proto` NAMESPACE:** In addition to nvForest-specific code, the new
-implementation requires some more general-purpose CPU-GPU interoperable
-utilities. Many of these utilities are either already implemented in RAFT (but
-do not provide the required CPU-interoperable compilation guarantees) or are a
-natural fit for incorporation in RAFT. In order to allow for more careful
-integration with the existing RAFT codebase and interoperability
-strategies, these utilities are currently provided in the `raft_proto`
-namespace but will be moved into RAFT over time. Other algorithms should
-not make use of the `raft_proto` namespace but instead wait until this
-transition has taken place.
-
## Design Goals
1. Provide state-of-the-art runtime performance for forest models on GPU,
especially for cases where CPU performance will not suffice (e.g. large
@@ -43,7 +32,7 @@ codebase.
It is also occasionally useful to make use of a `constexpr` value
indicating whether or not `NVFOREST_ENABLE_GPU` is set, which we introduce as
-`raft_proto::GPU_ENABLED`.
+`nvforest::detail::GPU_ENABLED`.
### Avoiding CUDA symbols in CPU-only builds
The most significant challenge of attempting to create a unified CPU/GPU
@@ -88,7 +77,7 @@ between GPU and CPU.
Where we _need_ to provide distinct logic between GPU and CPU
implementations, we do so in implementation headers. In `infer/cpu.hpp`, we
have a fully-defined template for CPU specializations of
-`detail::inference::infer`. If `raft_proto::GPU_ENABLED` is `false`, we also
+`detail::inference::infer`. If `nvforest::detail::GPU_ENABLED` is `false`, we also
include the GPU specializations, which will simply throw an exception if
invoked. In `infer/gpu.hpp` we *declare* but do not *define* the GPU
specializations. In `infer/gpu.cuh` we provide the full working definition for
@@ -158,8 +147,8 @@ a standard benchmark) on the CPU.
With some motivation for the general approach to CPU-GPU interoperability, we
now offer an overview of the layout of the codebase to help guide future
-improvements. Because `raft_proto` utilities are going to be moved to RAFT or other
-general-purpose libraries, we will not review anything within the `raft_proto`
+improvements. Because the general-purpose CPU-GPU interoperable utilities in `nvforest::detail` are
+going to be moved to RAFT or other general-purpose libraries, we will not review them in the `detail`
directory here.
### Public Headers
diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md
index 3e3df17..711ab33 100644
--- a/cpp/include/nvforest/README.md
+++ b/cpp/include/nvforest/README.md
@@ -19,13 +19,6 @@ available in the top-level include directory. The `detail` directory
contains implementation details that are not required to use nvForest and which
will certainly change over time.
-**A NOTE ON THE `raft_proto` NAMESPACE:** For the first iteration of this nvForest
-implementation, much of the more general-purpose CPU-GPU interoperable code
-has temporarily been put in the `raft_proto` namespace. As the name suggests,
-the intention is that most or all of this functionality will either be moved
-to RAFT or that RAFT features will be updated to provide CPU-GPU
-compatible versions of the same.
-
### Importing a model
nvForest uses Treelite as a common translation layer for all its input types.
To load a forest model, we first create a Treelite model handle as
@@ -50,7 +43,7 @@ auto nvforest_model = import_from_treelite_model(
tree_layout::depth_first, // layout
128u, // align_bytes
false, // use_double_precision
- raft_proto::device_type::gpu, // mem_type
+ nvforest::device_type::gpu, // dev_type
0, // device_id
stream // CUDA stream
);
@@ -74,7 +67,7 @@ serialization format will be used. Otherwise, the model will be evaluated
at double precision if this value is set to `true` or single precision if this
value is set to `false`.
-**dev_type**: This argument controls where the model will be executed. If `raft_proto::device_type::gpu`, then it will be executed on GPU. If `raft_proto::device_type::cpu`, then it will be executed on CPU.
+**dev_type**: This argument controls where the model will be executed. If `nvforest::device_type::gpu`, then it will be executed on GPU. If `nvforest::device_type::cpu`, then it will be executed on CPU.
**device_id**: This integer indicates the ID of the GPU which should be used.
If CPU is being used, this argument is ignored.
@@ -82,9 +75,9 @@ If CPU is being used, this argument is ignored.
**stream**: The CUDA stream which will be used for the actual model import.
If CPU is being used, this argument is ignored. Note that you do *not* need
CUDA headers if you are working with a CPU-only build of nvForest. This
-argument uses a `raft_proto::cuda_stream` type which evaluates to a
+argument uses an `nvforest::cuda_stream` type which evaluates to a
placeholder type in CPU-only builds. For applications which themselves want to
-implement CPU-GPU interoperable builds, the `raft_proto::cuda_stream` type can be
+implement CPU-GPU interoperable builds, the `nvforest::cuda_stream` type can be
used directly.
@@ -106,24 +99,24 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
// Assuming that input is a float* pointing to data already located on-device
-auto handle = raft_proto::handle_t{};
+auto handle = nvforest::handle_t{};
nvforest_model.predict(
handle,
output,
input,
num_rows,
- raft_proto::device_type::gpu, // out_mem_type
- raft_proto::device_type::gpu, // in_mem_type
+ nvforest::device_type::gpu, // out_mem_type
+ nvforest::device_type::gpu, // in_mem_type
4 // chunk_size
);
```
**handle**: To provide a unified interface on CPU and GPU, we introduce
-`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
+`nvforest::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
placeholder in CPU-only builds, and using it does not require any CUDA
functionality. For GPU-enabled builds, you can construct a
-`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use.
+`nvforest::handle_t` directly from the `raft::handle_t` you wish to use.
**output**: Pointer to pre-allocated buffer where results should be
written. If the model has been loaded at single precision, this should be a
diff --git a/cpp/include/nvforest/detail/raft_proto/buffer.hpp b/cpp/include/nvforest/buffer.hpp
similarity index 69%
rename from cpp/include/nvforest/detail/raft_proto/buffer.hpp
rename to cpp/include/nvforest/buffer.hpp
index 7490e48..deebf60 100644
--- a/cpp/include/nvforest/detail/raft_proto/buffer.hpp
+++ b/cpp/include/nvforest/buffer.hpp
@@ -3,15 +3,15 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
#include
@@ -21,7 +21,8 @@
#include
#include
-namespace raft_proto {
+namespace nvforest {
+
/**
* @brief A container which may or may not own its own data on host or device
*
@@ -31,10 +32,10 @@ struct buffer {
using index_type = std::size_t;
using value_type = T;
- using data_store = std::variant,
- non_owning_buffer,
- owning_buffer,
- owning_buffer>;
+ using data_store = std::variant,
+ detail::non_owning_buffer,
+ detail::owning_buffer,
+ detail::owning_buffer>;
buffer() : device_{}, data_{}, size_{}, cached_ptr{nullptr} {}
@@ -44,19 +45,19 @@ struct buffer {
int device = 0,
cuda_stream stream = 0)
: device_{[mem_type, &device]() {
- auto result = device_id_variant{};
+ auto result = detail::device_id_variant{};
switch (mem_type) {
- case device_type::cpu: result = device_id{device}; break;
- case device_type::gpu: result = device_id{device}; break;
+ case device_type::cpu: result = detail::device_id{device}; break;
+ case device_type::gpu: result = detail::device_id{device}; break;
}
return result;
}()},
data_{[this, mem_type, size, stream]() {
auto result = data_store{};
switch (mem_type) {
- case device_type::cpu: result = owning_buffer{size}; break;
+ case device_type::cpu: result = detail::owning_buffer{size}; break;
case device_type::gpu:
- result = owning_buffer{std::get<1>(device_), size, stream};
+ result = detail::owning_buffer{std::get<1>(device_), size, stream};
break;
}
return result;
@@ -78,18 +79,22 @@ struct buffer {
/** Construct non-owning buffer */
buffer(T* input_data, index_type size, device_type mem_type = device_type::cpu, int device = 0)
: device_{[mem_type, &device]() {
- auto result = device_id_variant{};
+ auto result = detail::device_id_variant{};
switch (mem_type) {
- case device_type::cpu: result = device_id{device}; break;
- case device_type::gpu: result = device_id{device}; break;
+ case device_type::cpu: result = detail::device_id{device}; break;
+ case device_type::gpu: result = detail::device_id{device}; break;
}
return result;
}()},
data_{[input_data, mem_type]() {
auto result = data_store{};
switch (mem_type) {
- case device_type::cpu: result = non_owning_buffer{input_data}; break;
- case device_type::gpu: result = non_owning_buffer{input_data}; break;
+ case device_type::cpu:
+ result = detail::non_owning_buffer{input_data};
+ break;
+ case device_type::gpu:
+ result = detail::non_owning_buffer{input_data};
+ break;
}
return result;
}()},
@@ -118,10 +123,10 @@ struct buffer {
int device = 0,
cuda_stream stream = cuda_stream{})
: device_{[mem_type, &device]() {
- auto result = device_id_variant{};
+ auto result = detail::device_id_variant{};
switch (mem_type) {
- case device_type::cpu: result = device_id{device}; break;
- case device_type::gpu: result = device_id{device}; break;
+ case device_type::cpu: result = detail::device_id{device}; break;
+ case device_type::gpu: result = detail::device_id{device}; break;
}
return result;
}()},
@@ -129,11 +134,12 @@ struct buffer {
auto result = data_store{};
auto result_data = static_cast(nullptr);
if (mem_type == device_type::cpu) {
- auto buf = owning_buffer(other.size());
+ auto buf = detail::owning_buffer(other.size());
result_data = buf.get();
result = std::move(buf);
} else if (mem_type == device_type::gpu) {
- auto buf = owning_buffer(std::get<1>(device_), other.size(), stream);
+ auto buf =
+ detail::owning_buffer(std::get<1>(device_), other.size(), stream);
result_data = buf.get();
result = std::move(buf);
}
@@ -188,10 +194,10 @@ struct buffer {
*/
buffer(buffer&& other, device_type mem_type, int device, cuda_stream stream)
: device_{[mem_type, &device]() {
- auto result = device_id_variant{};
+ auto result = detail::device_id_variant{};
switch (mem_type) {
- case device_type::cpu: result = device_id{device}; break;
- case device_type::gpu: result = device_id{device}; break;
+ case device_type::cpu: result = detail::device_id{device}; break;
+ case device_type::gpu: result = detail::device_id{device}; break;
}
return result;
}()},
@@ -202,11 +208,11 @@ struct buffer {
} else {
auto* result_data = static_cast(nullptr);
if (mem_type == device_type::cpu) {
- auto buf = owning_buffer{other.size()};
+ auto buf = detail::owning_buffer{other.size()};
result_data = buf.get();
result = std::move(buf);
} else if (mem_type == device_type::gpu) {
- auto buf = owning_buffer{device, other.size(), stream};
+ auto buf = detail::owning_buffer{device, other.size(), stream};
result_data = buf.get();
result = std::move(buf);
}
@@ -306,23 +312,23 @@ struct buffer {
~buffer() = default;
private:
- device_id_variant device_;
+ detail::device_id_variant device_;
data_store data_;
index_type size_;
T* cached_ptr;
};
template
-const_agnostic_same_t copy(buffer& dst,
- buffer const& src,
- typename buffer::index_type dst_offset,
- typename buffer::index_type src_offset,
- typename buffer::index_type size,
- cuda_stream stream)
+detail::const_agnostic_same_t copy(buffer& dst,
+ buffer const& src,
+ typename buffer::index_type dst_offset,
+ typename buffer::index_type src_offset,
+ typename buffer::index_type size,
+ cuda_stream stream)
{
if constexpr (bounds_check) {
if (src.size() - src_offset < size || dst.size() - dst_offset < size) {
- throw out_of_bounds("Attempted copy to or from buffer of inadequate size");
+ throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size");
}
}
copy(dst.data() + dst_offset,
@@ -334,27 +340,27 @@ const_agnostic_same_t copy(buffer& dst,
}
template
-const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream)
+detail::const_agnostic_same_t copy(buffer& dst, buffer const& src, cuda_stream stream)
{
copy(dst, src, 0, 0, src.size(), stream);
}
template
-const_agnostic_same_t copy(buffer& dst, buffer const& src)
+detail::const_agnostic_same_t copy(buffer& dst, buffer const& src)
{
copy(dst, src, 0, 0, src.size(), cuda_stream{});
}
template
-const_agnostic_same_t copy(buffer&& dst,
- buffer&& src,
- typename buffer::index_type dst_offset,
- typename buffer::index_type src_offset,
- typename buffer::index_type size,
- cuda_stream stream)
+detail::const_agnostic_same_t copy(buffer&& dst,
+ buffer&& src,
+ typename buffer::index_type dst_offset,
+ typename buffer::index_type src_offset,
+ typename buffer::index_type size,
+ cuda_stream stream)
{
if constexpr (bounds_check) {
if (src.size() - src_offset < size || dst.size() - dst_offset < size) {
- throw out_of_bounds("Attempted copy to or from buffer of inadequate size");
+ throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size");
}
}
copy(dst.data() + dst_offset,
@@ -366,23 +372,23 @@ const_agnostic_same_t copy(buffer&& dst,
}
template
-const_agnostic_same_t copy(buffer&& dst,
- buffer&& src,
- typename buffer::index_type dst_offset,
- cuda_stream stream)
+detail::const_agnostic_same_t copy(buffer&& dst,
+ buffer&& src,
+ typename buffer::index_type dst_offset,
+ cuda_stream stream)
{
copy(dst, src, dst_offset, 0, src.size(), stream);
}
template
-const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream)
+detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src, cuda_stream stream)
{
copy(dst, src, 0, 0, src.size(), stream);
}
template
-const_agnostic_same_t copy(buffer&& dst, buffer&& src)
+detail::const_agnostic_same_t copy(buffer&& dst, buffer&& src)
{
copy(dst, src, 0, 0, src.size(), cuda_stream{});
}
-} // namespace raft_proto
+} // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp b/cpp/include/nvforest/cuda_stream.hpp
similarity index 71%
rename from cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp
rename to cpp/include/nvforest/cuda_stream.hpp
index f80c488..d231698 100644
--- a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp
+++ b/cpp/include/nvforest/cuda_stream.hpp
@@ -7,7 +7,7 @@
#include
#endif
-namespace raft_proto {
+namespace nvforest {
#ifdef NVFOREST_ENABLE_GPU
using cuda_stream = cudaStream_t;
#else
@@ -19,4 +19,9 @@ inline void synchronize(cuda_stream stream)
cudaStreamSynchronize(stream);
#endif
}
-} // namespace raft_proto
+} // namespace nvforest
+
+namespace nvforest::detail {
+using nvforest::cuda_stream;
+using nvforest::synchronize;
+} // namespace nvforest::detail
diff --git a/cpp/include/nvforest/decision_forest.hpp b/cpp/include/nvforest/decision_forest.hpp
index e0c5a9d..6534196 100644
--- a/cpp/include/nvforest/decision_forest.hpp
+++ b/cpp/include/nvforest/decision_forest.hpp
@@ -3,15 +3,16 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
+#include
#include
+#include
+#include
#include
+#include
#include
#include
#include
#include
-#include
-#include
-#include
#include
#include
#include
@@ -145,21 +146,21 @@ struct decision_forest {
* operations, including sigmoid, exponential, and
* logarithm_one_plus_exp
*/
- decision_forest(raft_proto::buffer&& nodes,
- raft_proto::buffer&& root_node_indexes,
- raft_proto::buffer&& node_id_mapping,
- raft_proto::buffer&& bias,
- index_type num_features,
- index_type num_outputs = index_type{2},
- bool has_categorical_nodes = false,
- std::optional>&& vector_output = std::nullopt,
- std::optional>&&
- categorical_storage = std::nullopt,
- index_type leaf_size = index_type{1},
- row_op row_postproc = row_op::disable,
- element_op elem_postproc = element_op::disable,
- io_type average_factor = io_type{1},
- io_type postproc_constant = io_type{1})
+ decision_forest(
+ buffer&& nodes,
+ buffer&& root_node_indexes,
+ buffer&& node_id_mapping,
+ buffer&& bias,
+ index_type num_features,
+ index_type num_outputs = index_type{2},
+ bool has_categorical_nodes = false,
+ std::optional>&& vector_output = std::nullopt,
+ std::optional>&& categorical_storage = std::nullopt,
+ index_type leaf_size = index_type{1},
+ row_op row_postproc = row_op::disable,
+ element_op elem_postproc = element_op::disable,
+ io_type average_factor = io_type{1},
+ io_type postproc_constant = io_type{1})
: nodes_{nodes},
root_node_indexes_{root_node_indexes},
node_id_mapping_{node_id_mapping},
@@ -176,11 +177,11 @@ struct decision_forest {
postproc_constant_{postproc_constant}
{
if (nodes.memory_type() != root_node_indexes.memory_type()) {
- throw raft_proto::mem_type_mismatch(
+ throw detail::mem_type_mismatch(
"Nodes and indexes of forest must both be stored on either host or device");
}
if (nodes.device_index() != root_node_indexes.device_index()) {
- throw raft_proto::mem_type_mismatch(
+ throw detail::mem_type_mismatch(
"Nodes and indexes of forest must both be stored on same device");
}
detail::initialize_device(nodes.device());
@@ -245,18 +246,18 @@ struct decision_forest {
* 1 to 32 is a valid value, and in general larger batches benefit from
* larger values.
*/
- void predict(raft_proto::buffer& output,
- raft_proto::buffer const& input,
- raft_proto::cuda_stream stream = raft_proto::cuda_stream{},
+ void predict(buffer& output,
+ buffer const& input,
+ nvforest::cuda_stream stream = nvforest::cuda_stream{},
infer_kind predict_type = infer_kind::default_kind,
std::optional specified_rows_per_block_iter = std::nullopt)
{
if (output.memory_type() != memory_type() || input.memory_type() != memory_type()) {
- throw raft_proto::wrong_device_type{
+ throw detail::wrong_device_type{
"Tried to use host I/O data with model on device or vice versa"};
}
if (output.device_index() != device_index() || input.device_index() != device_index()) {
- throw raft_proto::wrong_device{"I/O data on different device than model"};
+ throw detail::wrong_device{"I/O data on different device than model"};
}
auto* vector_output_data =
(vector_output_.has_value() ? vector_output_->data() : static_cast(nullptr));
@@ -265,54 +266,54 @@ struct decision_forest {
: static_cast(nullptr));
switch (nodes_.device().index()) {
case 0:
- nvforest::detail::infer(obj(),
- get_postprocessor(predict_type),
- output.data(),
- input.data(),
- index_type(input.size() / num_features_),
- num_features_,
- num_outputs(predict_type),
- has_categorical_nodes_,
- vector_output_data,
- categorical_storage_data,
- predict_type,
- specified_rows_per_block_iter,
- std::get<0>(nodes_.device()),
- stream);
+ detail::infer(obj(),
+ get_postprocessor(predict_type),
+ output.data(),
+ input.data(),
+ index_type(input.size() / num_features_),
+ num_features_,
+ num_outputs(predict_type),
+ has_categorical_nodes_,
+ vector_output_data,
+ categorical_storage_data,
+ predict_type,
+ specified_rows_per_block_iter,
+ std::get<0>(nodes_.device()),
+ stream);
break;
case 1:
- nvforest::detail::infer(obj(),
- get_postprocessor(predict_type),
- output.data(),
- input.data(),
- index_type(input.size() / num_features_),
- num_features_,
- num_outputs(predict_type),
- has_categorical_nodes_,
- vector_output_data,
- categorical_storage_data,
- predict_type,
- specified_rows_per_block_iter,
- std::get<1>(nodes_.device()),
- stream);
+ detail::infer(obj(),
+ get_postprocessor(predict_type),
+ output.data(),
+ input.data(),
+ index_type(input.size() / num_features_),
+ num_features_,
+ num_outputs(predict_type),
+ has_categorical_nodes_,
+ vector_output_data,
+ categorical_storage_data,
+ predict_type,
+ specified_rows_per_block_iter,
+ std::get<1>(nodes_.device()),
+ stream);
break;
}
}
private:
/** The nodes for all trees in the forest */
- raft_proto::buffer nodes_;
+ buffer nodes_;
/** The index of the root node for each tree in the forest */
- raft_proto::buffer root_node_indexes_;
+ buffer root_node_indexes_;
/** Mapping to apply to node IDs. Only relevant when predict_type == infer_kind::leaf_id */
- raft_proto::buffer node_id_mapping_;
+ buffer node_id_mapping_;
/** Bias term to apply to the output */
- raft_proto::buffer bias_;
+ buffer bias_;
/** Buffer of outputs for all leaves in vector-leaf models */
- std::optional> vector_output_;
+ std::optional> vector_output_;
/** Buffer of elements used as backing data for bitsets which specify
* categories for all categorical nodes in the model. */
- std::optional> categorical_storage_;
+ std::optional> categorical_storage_;
// Metadata
index_type num_features_;
@@ -458,7 +459,7 @@ inline auto get_forest_variant_index(bool use_double_thresholds,
// TODO(wphicks): We are overestimating categorical storage required here
auto double_indexes_required =
(max_num_categories > max_local_categories &&
- ((raft_proto::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) >
+ ((detail::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) >
std::numeric_limits::max())) ||
num_vector_leaves > std::numeric_limits::max();
diff --git a/cpp/include/nvforest/detail/bitset.hpp b/cpp/include/nvforest/detail/bitset.hpp
index c11cdee..c166eb3 100644
--- a/cpp/include/nvforest/detail/bitset.hpp
+++ b/cpp/include/nvforest/detail/bitset.hpp
@@ -3,8 +3,8 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
-#include
+#include
+#include
#include
#include
diff --git a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp b/cpp/include/nvforest/detail/ceildiv.hpp
similarity index 79%
rename from cpp/include/nvforest/detail/raft_proto/ceildiv.hpp
rename to cpp/include/nvforest/detail/ceildiv.hpp
index 8f4fb6c..4d6245b 100644
--- a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp
+++ b/cpp/include/nvforest/detail/ceildiv.hpp
@@ -3,15 +3,15 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
+#include
#include
-namespace raft_proto {
+namespace nvforest::detail {
template
HOST DEVICE auto constexpr ceildiv(T dividend, U divisor)
{
static_assert(std::is_integral_v && std::is_integral_v, "Arguments must be integers");
return dividend / divisor + (dividend % divisor != 0);
}
-} // namespace raft_proto
+} // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/const_agnostic.hpp b/cpp/include/nvforest/detail/const_agnostic.hpp
new file mode 100644
index 0000000..25d8a45
--- /dev/null
+++ b/cpp/include/nvforest/detail/const_agnostic.hpp
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include
+
+namespace nvforest::detail {
+template
+using const_agnostic_same_t =
+ std::enable_if_t, std::remove_const_t>, V>;
+
+template
+inline constexpr auto const_agnostic_same_v =
+ std::is_same_v, std::remove_const_t>;
+} // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp b/cpp/include/nvforest/detail/copy.hpp
similarity index 58%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy.hpp
rename to cpp/include/nvforest/detail/copy.hpp
index fffd3ac..b02a996 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp
+++ b/cpp/include/nvforest/detail/copy.hpp
@@ -3,39 +3,33 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
-#include
+#include
+#include
#include
#ifdef NVFOREST_ENABLE_GPU
-#include
+#include
#endif
-#include
+#include
-namespace raft_proto {
+namespace nvforest {
template
void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset)
{
- detail::copy(dst + dst_offset, src + src_offset, size, cuda_stream{});
+ copy(dst + dst_offset, src + src_offset, size, cuda_stream{});
}
template
void copy(
T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream)
{
- detail::copy(dst + dst_offset, src + src_offset, size, stream);
+ copy(dst + dst_offset, src + src_offset, size, stream);
}
template
void copy(T* dst, T const* src, uint32_t size)
{
- detail::copy(dst, src, size, cuda_stream{});
-}
-
-template
-void copy(T* dst, T const* src, uint32_t size, cuda_stream stream)
-{
- detail::copy(dst, src, size, stream);
+ copy(dst, src, size, cuda_stream{});
}
template
@@ -49,17 +43,13 @@ void copy(T* dst,
cuda_stream stream)
{
if (dst_type == device_type::gpu && src_type == device_type::gpu) {
- detail::copy(
- dst + dst_offset, src + src_offset, size, stream);
+ copy(dst + dst_offset, src + src_offset, size, stream);
} else if (dst_type == device_type::cpu && src_type == device_type::cpu) {
- detail::copy(
- dst + dst_offset, src + src_offset, size, stream);
+ copy(dst + dst_offset, src + src_offset, size, stream);
} else if (dst_type == device_type::gpu && src_type == device_type::cpu) {
- detail::copy(
- dst + dst_offset, src + src_offset, size, stream);
+ copy(dst + dst_offset, src + src_offset, size, stream);
} else if (dst_type == device_type::cpu && src_type == device_type::gpu) {
- detail::copy(
- dst + dst_offset, src + src_offset, size, stream);
+ copy(dst + dst_offset, src + src_offset, size, stream);
}
}
@@ -80,4 +70,4 @@ void copy(T* dst,
copy(dst, src, size, dst_type, src_type, 0, 0, stream);
}
-} // namespace raft_proto
+} // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp b/cpp/include/nvforest/detail/copy/cpu.hpp
similarity index 71%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp
rename to cpp/include/nvforest/detail/copy/cpu.hpp
index c519c5b..796a97b 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp
+++ b/cpp/include/nvforest/detail/copy/cpu.hpp
@@ -3,16 +3,16 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
-#include
-#include
+#include
+#include
+#include
#include
#include
#include
-namespace raft_proto::detail {
+namespace nvforest {
template
std::enable_if_t,
@@ -27,11 +27,11 @@ template
std::enable_if_t<
std::conjunction_v,
std::bool_constant>,
- std::bool_constant>,
+ std::bool_constant>,
void>
copy(T* dst, T const* src, uint32_t size, cuda_stream stream)
{
- throw gpu_unsupported("Copying from or to device in non-GPU build");
+ throw detail::gpu_unsupported("Copying from or to device in non-GPU build");
}
-} // namespace raft_proto::detail
+} // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp b/cpp/include/nvforest/detail/copy/gpu.hpp
similarity index 58%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp
rename to cpp/include/nvforest/detail/copy/gpu.hpp
index b5bcded..94ae559 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp
+++ b/cpp/include/nvforest/detail/copy/gpu.hpp
@@ -3,9 +3,9 @@
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
-#include
-#include
-#include
+#include
+#include