diff --git a/build.sh b/build.sh
index 541c784..6497744 100755
--- a/build.sh
+++ b/build.sh
@@ -291,9 +291,9 @@ if (! hasArg --configure-only) && (completeBuild || hasArg libnvforest); then
           fi
         fi
         MSG="${MSG}<br/>parallel setting: $PARALLEL_LEVEL"
-        if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest++.so" ]]; then
-            LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest++.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}')
-            MSG="${MSG}<br/>libnvforest++.so size: $LIBNVFOREST_FS"
+        if [[ -f "${LIBNVFOREST_BUILD_DIR}/libnvforest.so" ]]; then
+            LIBNVFOREST_FS=$(find "${LIBNVFOREST_BUILD_DIR}" -name libnvforest.so -printf '%s'| awk '{printf "%.2f MB", $1/1024/1024}')
+            MSG="${MSG}<br/>libnvforest.so size: $LIBNVFOREST_FS"
         fi
         BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIBNVFOREST_BUILD_DIR}"}
         echo "The HTML report can be found at [${BMR_DIR}/ninja_log.html]. In CI, this report"
diff --git a/ci/build_wheel_nvforest.sh b/ci/build_wheel_nvforest.sh
index 7960ce2..ce96a70 100755
--- a/ci/build_wheel_nvforest.sh
+++ b/ci/build_wheel_nvforest.sh
@@ -20,7 +20,7 @@ LIBNVFOREST_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libnvforest_${RAPIDS_PY_CUDA_SUFF
 echo "libnvforest-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBNVFOREST_WHEELHOUSE}"/libnvforest_*.whl)" >> "${PIP_CONSTRAINT}"
 
 EXCLUDE_ARGS=(
-  --exclude "libnvforest++.so"
+  --exclude "libnvforest.so"
   --exclude "libraft.so"
   --exclude "libcublas.so.*"
   --exclude "libcublasLt.so.*"
diff --git a/conda/recipes/libnvforest/recipe.yaml b/conda/recipes/libnvforest/recipe.yaml
index 3495392..26ca5b9 100644
--- a/conda/recipes/libnvforest/recipe.yaml
+++ b/conda/recipes/libnvforest/recipe.yaml
@@ -91,7 +91,7 @@ outputs:
       prefix_detection:
         ignore:
           # See https://github.com/rapidsai/build-planning/issues/160
-          - lib/libnvforest++.so
+          - lib/libnvforest.so
       string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }}
     requirements:
       build:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fb9d015..8dbaa15 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -60,7 +60,7 @@ option(NVTX "Enable nvtx markers" OFF)
 option(USE_CCACHE "Cache build artifacts with ccache" OFF)
 option(NVFOREST_USE_RAFT_STATIC "Build and statically link the RAFT library" OFF)
 option(NVFOREST_USE_TREELITE_STATIC "Build and statically link the treelite library" OFF)
-option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest++" OFF)
+option(NVFOREST_EXPORT_TREELITE_LINKAGE "Whether to publicly or privately link treelite to libnvforest" OFF)
 option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
 
 # The options below allow incorporating libnvforest into another build process without installing all its components.
@@ -123,7 +123,7 @@ endif()
 # ######################################################################################################################
 # * Target names -------------------------------------------------------------
 
-set(NVFOREST_CPP_TARGET "nvforest++")
+set(NVFOREST_CPP_TARGET "nvforest")
 
 # ######################################################################################################################
 # * Conda environment detection ----------------------------------------------
@@ -193,7 +193,7 @@ if(BUILD_NVFOREST_TESTS)
 endif()
 
 # ######################################################################################################################
-# * build libnvforest++ shared library -------------------------------------------
+# * build libnvforest shared library -------------------------------------------
 
 file(
   WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
diff --git a/cpp/include/nvforest/Implementation.md b/cpp/include/nvforest/Implementation.md
index f34224b..b0bc595 100644
--- a/cpp/include/nvforest/Implementation.md
+++ b/cpp/include/nvforest/Implementation.md
@@ -6,17 +6,6 @@ does *not* require nvcc, CUDA or any other GPU-related library for its CPU-only
 build, we also go over general strategies for CPU/GPU interoperability as used
 by nvForest.
 
-**A NOTE ON THE `raft_proto` NAMESPACE:** In addition to nvForest-specific code, the new
-implementation requires some more general-purpose CPU-GPU interoperable
-utilities. Many of these utilities are either already implemented in RAFT (but
-do not provide the required CPU-interoperable compilation guarantees) or are a
-natural fit for incorporation in RAFT. In order to allow for more careful
-integration with the existing RAFT codebase and interoperability
-strategies, these utilities are currently provided in the `raft_proto`
-namespace but will be moved into RAFT over time. Other algorithms should
-not make use of the `raft_proto` namespace but instead wait until this
-transition has taken place.
-
 ## Design Goals
 1. Provide state-of-the-art runtime performance for forest models on GPU,
    especially for cases where CPU performance will not suffice (e.g. large
@@ -43,7 +32,7 @@ codebase.
 
 It is also occasionally useful to make use of a `constexpr` value
 indicating whether or not `NVFOREST_ENABLE_GPU` is set, which we introduce as
-`raft_proto::GPU_ENABLED`.
+`nvforest::detail::GPU_ENABLED`.
 
 ### Avoiding CUDA symbols in CPU-only builds
 The most significant challenge of attempting to create a unified CPU/GPU
@@ -88,7 +77,7 @@ between GPU and CPU.
 Where we _need_ to provide distinct logic between GPU and CPU
 implementations, we do so in implementation headers. In `infer/cpu.hpp`, we
 have a fully-defined template for CPU specializations of
-`detail::inference::infer`. If `raft_proto::GPU_ENABLED` is `false`, we also
+`detail::inference::infer`. If `nvforest::detail::GPU_ENABLED` is `false`, we also
 include the GPU specializations, which will simply throw an exception if
 invoked. In `infer/gpu.hpp` we *declare* but do not *define* the GPU
 specializations. In `infer/gpu.cuh` we provide the full working definition for
@@ -158,8 +147,8 @@ a standard benchmark) on the CPU.
 
 With some motivation for the general approach to CPU-GPU interoperability, we
 now offer an overview of the layout of the codebase to help guide future
-improvements. Because `raft_proto` utilities are going to be moved to RAFT or other
-general-purpose libraries, we will not review anything within the `raft_proto`
+improvements. Because `nvforest::detail` utilities are going to be moved to RAFT or other
+general-purpose libraries, we will not review anything within the `nvforest::detail`
 directory here.
 
 ### Public Headers
diff --git a/cpp/include/nvforest/README.md b/cpp/include/nvforest/README.md
index 3e3df17..711ab33 100644
--- a/cpp/include/nvforest/README.md
+++ b/cpp/include/nvforest/README.md
@@ -19,13 +19,6 @@ available in the top-level include directory. The `detail` directory
 contains implementation details that are not required to use nvForest and which
 will certainly change over time.
 
-**A NOTE ON THE `raft_proto` NAMESPACE:** For the first iteration of this nvForest
-implementation, much of the more general-purpose CPU-GPU interoperable code
-has temporarily been put in the `raft_proto` namespace. As the name suggests,
-the intention is that most or all of this functionality will either be moved
-to RAFT or that RAFT features will be updated to provide CPU-GPU
-compatible versions of the same.
-
 ### Importing a model
 nvForest uses Treelite as a common translation layer for all its input types.
 To load a forest model, we first create a Treelite model handle as
@@ -50,7 +43,7 @@ auto nvforest_model = import_from_treelite_model(
   tree_layout::depth_first, // layout
   128u,  // align_bytes
   false,  // use_double_precision
-  raft_proto::device_type::gpu,  // mem_type
+  nvforest::device_type::gpu,  // mem_type
   0,  // device_id
   stream  // CUDA stream
 );
@@ -74,7 +67,7 @@ serialization format will be used. Otherwise, the model will be evaluated
 at double precision if this value is set to `true` or single precision if this
 value is set to `false`.
 
-**dev_type**: This argument controls where the model will be executed. If `raft_proto::device_type::gpu`, then it will be executed on GPU. If `raft_proto::device_type::cpu`, then it will be executed on CPU.
+**dev_type**: This argument controls where the model will be executed. If `nvforest::device_type::gpu`, then it will be executed on GPU. If `nvforest::device_type::cpu`, then it will be executed on CPU.
 
 **device_id**: This integer indicates the ID of the GPU which should be used.
 If CPU is being used, this argument is ignored.
@@ -82,9 +75,9 @@ If CPU is being used, this argument is ignored.
 **stream**: The CUDA stream which will be used for the actual model import.
 If CPU is being used, this argument is ignored. Note that you do *not* need
 CUDA headers if you are working with a CPU-only build of nvForest. This
-argument uses a `raft_proto::cuda_stream` type which evaluates to a
+argument uses a `nvforest::cuda_stream` type which evaluates to a
 placeholder type in CPU-only builds. For applications which themselves want to
-implement CPU-GPU interoperable builds, the `raft_proto::cuda_stream` type can be
+implement CPU-GPU interoperable builds, the `nvforest::cuda_stream` type can be
 used directly.
 
 
@@ -106,24 +99,24 @@ cudaMalloc((void**)&output, num_rows * num_outputs * sizeof(float));
 
 // Assuming that input is a float* pointing to data already located on-device
 
-auto handle = raft_proto::handle_t{};
+auto handle = nvforest::handle_t{};
 
 nvforest_model.predict(
   handle,
   output,
   input,
   num_rows,
-  raft_proto::device_type::gpu,  // out_mem_type
-  raft_proto::device_type::gpu,  // in_mem_type
+  nvforest::device_type::gpu,  // out_mem_type
+  nvforest::device_type::gpu,  // in_mem_type
   4  // chunk_size
 );
 ```
 
 **handle**: To provide a unified interface on CPU and GPU, we introduce
-`raft_proto::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
+`nvforest::handle_t` as a wrapper for `raft::handle_t`. This is currently just a
 placeholder in CPU-only builds, and using it does not require any CUDA
 functionality. For GPU-enabled builds, you can construct a
-`raft_proto_handle_t` directly from the `raft::handle_t` you wish to use.
+`nvforest::handle_t` directly from the `raft::handle_t` you wish to use.
 
 **output**: Pointer to pre-allocated buffer where results should be
 written. If the model has been loaded at single precision, this should be a
diff --git a/cpp/include/nvforest/detail/raft_proto/buffer.hpp b/cpp/include/nvforest/buffer.hpp
similarity index 69%
rename from cpp/include/nvforest/detail/raft_proto/buffer.hpp
rename to cpp/include/nvforest/buffer.hpp
index 7490e48..deebf60 100644
--- a/cpp/include/nvforest/detail/raft_proto/buffer.hpp
+++ b/cpp/include/nvforest/buffer.hpp
@@ -3,15 +3,15 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/detail/const_agnostic.hpp>
-#include <nvforest/detail/raft_proto/detail/copy.hpp>
-#include <nvforest/detail/raft_proto/detail/non_owning_buffer.hpp>
-#include <nvforest/detail/raft_proto/detail/owning_buffer.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/exceptions.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/const_agnostic.hpp>
+#include <nvforest/detail/copy.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/exceptions.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/detail/non_owning_buffer.hpp>
+#include <nvforest/detail/owning_buffer.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <stdint.h>
 
@@ -21,7 +21,8 @@
 #include <utility>
 #include <variant>
 
-namespace raft_proto {
+namespace nvforest {
+
 /**
  * @brief A container which may or may not own its own data on host or device
  *
@@ -31,10 +32,10 @@ struct buffer {
   using index_type = std::size_t;
   using value_type = T;
 
-  using data_store = std::variant<non_owning_buffer<device_type::cpu, T>,
-                                  non_owning_buffer<device_type::gpu, T>,
-                                  owning_buffer<device_type::cpu, T>,
-                                  owning_buffer<device_type::gpu, T>>;
+  using data_store = std::variant<detail::non_owning_buffer<device_type::cpu, T>,
+                                  detail::non_owning_buffer<device_type::gpu, T>,
+                                  detail::owning_buffer<device_type::cpu, T>,
+                                  detail::owning_buffer<device_type::gpu, T>>;
 
   buffer() : device_{}, data_{}, size_{}, cached_ptr{nullptr} {}
 
@@ -44,19 +45,19 @@ struct buffer {
          int device           = 0,
          cuda_stream stream   = 0)
     : device_{[mem_type, &device]() {
-        auto result = device_id_variant{};
+        auto result = detail::device_id_variant{};
         switch (mem_type) {
-          case device_type::cpu: result = device_id<device_type::cpu>{device}; break;
-          case device_type::gpu: result = device_id<device_type::gpu>{device}; break;
+          case device_type::cpu: result = detail::device_id<device_type::cpu>{device}; break;
+          case device_type::gpu: result = detail::device_id<device_type::gpu>{device}; break;
         }
         return result;
       }()},
       data_{[this, mem_type, size, stream]() {
         auto result = data_store{};
         switch (mem_type) {
-          case device_type::cpu: result = owning_buffer<device_type::cpu, T>{size}; break;
+          case device_type::cpu: result = detail::owning_buffer<device_type::cpu, T>{size}; break;
           case device_type::gpu:
-            result = owning_buffer<device_type::gpu, T>{std::get<1>(device_), size, stream};
+            result = detail::owning_buffer<device_type::gpu, T>{std::get<1>(device_), size, stream};
             break;
         }
         return result;
@@ -78,18 +79,22 @@ struct buffer {
   /** Construct non-owning buffer */
   buffer(T* input_data, index_type size, device_type mem_type = device_type::cpu, int device = 0)
     : device_{[mem_type, &device]() {
-        auto result = device_id_variant{};
+        auto result = detail::device_id_variant{};
         switch (mem_type) {
-          case device_type::cpu: result = device_id<device_type::cpu>{device}; break;
-          case device_type::gpu: result = device_id<device_type::gpu>{device}; break;
+          case device_type::cpu: result = detail::device_id<device_type::cpu>{device}; break;
+          case device_type::gpu: result = detail::device_id<device_type::gpu>{device}; break;
         }
         return result;
       }()},
       data_{[input_data, mem_type]() {
         auto result = data_store{};
         switch (mem_type) {
-          case device_type::cpu: result = non_owning_buffer<device_type::cpu, T>{input_data}; break;
-          case device_type::gpu: result = non_owning_buffer<device_type::gpu, T>{input_data}; break;
+          case device_type::cpu:
+            result = detail::non_owning_buffer<device_type::cpu, T>{input_data};
+            break;
+          case device_type::gpu:
+            result = detail::non_owning_buffer<device_type::gpu, T>{input_data};
+            break;
         }
         return result;
       }()},
@@ -118,10 +123,10 @@ struct buffer {
          int device         = 0,
          cuda_stream stream = cuda_stream{})
     : device_{[mem_type, &device]() {
-        auto result = device_id_variant{};
+        auto result = detail::device_id_variant{};
         switch (mem_type) {
-          case device_type::cpu: result = device_id<device_type::cpu>{device}; break;
-          case device_type::gpu: result = device_id<device_type::gpu>{device}; break;
+          case device_type::cpu: result = detail::device_id<device_type::cpu>{device}; break;
+          case device_type::gpu: result = detail::device_id<device_type::gpu>{device}; break;
         }
         return result;
       }()},
@@ -129,11 +134,12 @@ struct buffer {
         auto result      = data_store{};
         auto result_data = static_cast<T*>(nullptr);
         if (mem_type == device_type::cpu) {
-          auto buf    = owning_buffer<device_type::cpu, T>(other.size());
+          auto buf    = detail::owning_buffer<device_type::cpu, T>(other.size());
           result_data = buf.get();
           result      = std::move(buf);
         } else if (mem_type == device_type::gpu) {
-          auto buf = owning_buffer<device_type::gpu, T>(std::get<1>(device_), other.size(), stream);
+          auto buf =
+            detail::owning_buffer<device_type::gpu, T>(std::get<1>(device_), other.size(), stream);
           result_data = buf.get();
           result      = std::move(buf);
         }
@@ -188,10 +194,10 @@ struct buffer {
    */
   buffer(buffer<T>&& other, device_type mem_type, int device, cuda_stream stream)
     : device_{[mem_type, &device]() {
-        auto result = device_id_variant{};
+        auto result = detail::device_id_variant{};
         switch (mem_type) {
-          case device_type::cpu: result = device_id<device_type::cpu>{device}; break;
-          case device_type::gpu: result = device_id<device_type::gpu>{device}; break;
+          case device_type::cpu: result = detail::device_id<device_type::cpu>{device}; break;
+          case device_type::gpu: result = detail::device_id<device_type::gpu>{device}; break;
         }
         return result;
       }()},
@@ -202,11 +208,11 @@ struct buffer {
         } else {
           auto* result_data = static_cast<T*>(nullptr);
           if (mem_type == device_type::cpu) {
-            auto buf    = owning_buffer<device_type::cpu, T>{other.size()};
+            auto buf    = detail::owning_buffer<device_type::cpu, T>{other.size()};
             result_data = buf.get();
             result      = std::move(buf);
           } else if (mem_type == device_type::gpu) {
-            auto buf    = owning_buffer<device_type::gpu, T>{device, other.size(), stream};
+            auto buf    = detail::owning_buffer<device_type::gpu, T>{device, other.size(), stream};
             result_data = buf.get();
             result      = std::move(buf);
           }
@@ -306,23 +312,23 @@ struct buffer {
   ~buffer() = default;
 
  private:
-  device_id_variant device_;
+  detail::device_id_variant device_;
   data_store data_;
   index_type size_;
   T* cached_ptr;
 };
 
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>& dst,
-                                 buffer<U> const& src,
-                                 typename buffer<T>::index_type dst_offset,
-                                 typename buffer<U>::index_type src_offset,
-                                 typename buffer<T>::index_type size,
-                                 cuda_stream stream)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>& dst,
+                                         buffer<U> const& src,
+                                         typename buffer<T>::index_type dst_offset,
+                                         typename buffer<U>::index_type src_offset,
+                                         typename buffer<T>::index_type size,
+                                         cuda_stream stream)
 {
   if constexpr (bounds_check) {
     if (src.size() - src_offset < size || dst.size() - dst_offset < size) {
-      throw out_of_bounds("Attempted copy to or from buffer of inadequate size");
+      throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size");
     }
   }
   copy(dst.data() + dst_offset,
@@ -334,27 +340,27 @@ const_agnostic_same_t<T, U> copy(buffer<T>& dst,
 }
 
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>& dst, buffer<U> const& src, cuda_stream stream)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>& dst, buffer<U> const& src, cuda_stream stream)
 {
   copy<bounds_check>(dst, src, 0, 0, src.size(), stream);
 }
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>& dst, buffer<U> const& src)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>& dst, buffer<U> const& src)
 {
   copy<bounds_check>(dst, src, 0, 0, src.size(), cuda_stream{});
 }
 
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>&& dst,
-                                 buffer<U>&& src,
-                                 typename buffer<T>::index_type dst_offset,
-                                 typename buffer<U>::index_type src_offset,
-                                 typename buffer<T>::index_type size,
-                                 cuda_stream stream)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>&& dst,
+                                         buffer<U>&& src,
+                                         typename buffer<T>::index_type dst_offset,
+                                         typename buffer<U>::index_type src_offset,
+                                         typename buffer<T>::index_type size,
+                                         cuda_stream stream)
 {
   if constexpr (bounds_check) {
     if (src.size() - src_offset < size || dst.size() - dst_offset < size) {
-      throw out_of_bounds("Attempted copy to or from buffer of inadequate size");
+      throw detail::out_of_bounds("Attempted copy to or from buffer of inadequate size");
     }
   }
   copy(dst.data() + dst_offset,
@@ -366,23 +372,23 @@ const_agnostic_same_t<T, U> copy(buffer<T>&& dst,
 }
 
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>&& dst,
-                                 buffer<U>&& src,
-                                 typename buffer<T>::index_type dst_offset,
-                                 cuda_stream stream)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>&& dst,
+                                         buffer<U>&& src,
+                                         typename buffer<T>::index_type dst_offset,
+                                         cuda_stream stream)
 {
   copy<bounds_check>(dst, src, dst_offset, 0, src.size(), stream);
 }
 
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>&& dst, buffer<U>&& src, cuda_stream stream)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>&& dst, buffer<U>&& src, cuda_stream stream)
 {
   copy<bounds_check>(dst, src, 0, 0, src.size(), stream);
 }
 template <bool bounds_check, typename T, typename U>
-const_agnostic_same_t<T, U> copy(buffer<T>&& dst, buffer<U>&& src)
+detail::const_agnostic_same_t<T, U> copy(buffer<T>&& dst, buffer<U>&& src)
 {
   copy<bounds_check>(dst, src, 0, 0, src.size(), cuda_stream{});
 }
 
-}  // namespace raft_proto
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp b/cpp/include/nvforest/cuda_stream.hpp
similarity index 71%
rename from cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp
rename to cpp/include/nvforest/cuda_stream.hpp
index f80c488..d231698 100644
--- a/cpp/include/nvforest/detail/raft_proto/cuda_stream.hpp
+++ b/cpp/include/nvforest/cuda_stream.hpp
@@ -7,7 +7,7 @@
 #include <cuda_runtime_api.h>
 #endif
 
-namespace raft_proto {
+namespace nvforest {
 #ifdef NVFOREST_ENABLE_GPU
 using cuda_stream = cudaStream_t;
 #else
@@ -19,4 +19,9 @@ inline void synchronize(cuda_stream stream)
   cudaStreamSynchronize(stream);
 #endif
 }
-}  // namespace raft_proto
+}  // namespace nvforest
+
+namespace nvforest::detail {
+using nvforest::cuda_stream;
+using nvforest::synchronize;
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/decision_forest.hpp b/cpp/include/nvforest/decision_forest.hpp
index e0c5a9d..6534196 100644
--- a/cpp/include/nvforest/decision_forest.hpp
+++ b/cpp/include/nvforest/decision_forest.hpp
@@ -3,15 +3,16 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/buffer.hpp>
 #include <nvforest/constants.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/ceildiv.hpp>
 #include <nvforest/detail/device_initialization.hpp>
+#include <nvforest/detail/exceptions.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/infer.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/exceptions.hpp>
 #include <nvforest/detail/specialization_types.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/infer_kind.hpp>
@@ -145,21 +146,21 @@ struct decision_forest {
    * operations, including sigmoid, exponential, and
    * logarithm_one_plus_exp
    */
-  decision_forest(raft_proto::buffer<node_type>&& nodes,
-                  raft_proto::buffer<index_type>&& root_node_indexes,
-                  raft_proto::buffer<index_type>&& node_id_mapping,
-                  raft_proto::buffer<io_type>&& bias,
-                  index_type num_features,
-                  index_type num_outputs                                     = index_type{2},
-                  bool has_categorical_nodes                                 = false,
-                  std::optional<raft_proto::buffer<io_type>>&& vector_output = std::nullopt,
-                  std::optional<raft_proto::buffer<typename node_type::index_type>>&&
-                    categorical_storage     = std::nullopt,
-                  index_type leaf_size      = index_type{1},
-                  row_op row_postproc       = row_op::disable,
-                  element_op elem_postproc  = element_op::disable,
-                  io_type average_factor    = io_type{1},
-                  io_type postproc_constant = io_type{1})
+  decision_forest(
+    buffer<node_type>&& nodes,
+    buffer<index_type>&& root_node_indexes,
+    buffer<index_type>&& node_id_mapping,
+    buffer<io_type>&& bias,
+    index_type num_features,
+    index_type num_outputs                                                      = index_type{2},
+    bool has_categorical_nodes                                                  = false,
+    std::optional<buffer<io_type>>&& vector_output                              = std::nullopt,
+    std::optional<buffer<typename node_type::index_type>>&& categorical_storage = std::nullopt,
+    index_type leaf_size                                                        = index_type{1},
+    row_op row_postproc                                                         = row_op::disable,
+    element_op elem_postproc  = element_op::disable,
+    io_type average_factor    = io_type{1},
+    io_type postproc_constant = io_type{1})
     : nodes_{nodes},
       root_node_indexes_{root_node_indexes},
       node_id_mapping_{node_id_mapping},
@@ -176,11 +177,11 @@ struct decision_forest {
       postproc_constant_{postproc_constant}
   {
     if (nodes.memory_type() != root_node_indexes.memory_type()) {
-      throw raft_proto::mem_type_mismatch(
+      throw detail::mem_type_mismatch(
         "Nodes and indexes of forest must both be stored on either host or device");
     }
     if (nodes.device_index() != root_node_indexes.device_index()) {
-      throw raft_proto::mem_type_mismatch(
+      throw detail::mem_type_mismatch(
         "Nodes and indexes of forest must both be stored on same device");
     }
     detail::initialize_device<forest_type>(nodes.device());
@@ -245,18 +246,18 @@ struct decision_forest {
    * 1 to 32 is a valid value, and in general larger batches benefit from
    * larger values.
    */
-  void predict(raft_proto::buffer<typename forest_type::io_type>& output,
-               raft_proto::buffer<typename forest_type::io_type> const& input,
-               raft_proto::cuda_stream stream                          = raft_proto::cuda_stream{},
+  void predict(buffer<typename forest_type::io_type>& output,
+               buffer<typename forest_type::io_type> const& input,
+               nvforest::cuda_stream stream                            = nvforest::cuda_stream{},
                infer_kind predict_type                                 = infer_kind::default_kind,
                std::optional<index_type> specified_rows_per_block_iter = std::nullopt)
   {
     if (output.memory_type() != memory_type() || input.memory_type() != memory_type()) {
-      throw raft_proto::wrong_device_type{
+      throw detail::wrong_device_type{
         "Tried to use host I/O data with model on device or vice versa"};
     }
     if (output.device_index() != device_index() || input.device_index() != device_index()) {
-      throw raft_proto::wrong_device{"I/O data on different device than model"};
+      throw detail::wrong_device{"I/O data on different device than model"};
     }
     auto* vector_output_data =
       (vector_output_.has_value() ? vector_output_->data() : static_cast<io_type*>(nullptr));
@@ -265,54 +266,54 @@ struct decision_forest {
                                         : static_cast<categorical_storage_type*>(nullptr));
     switch (nodes_.device().index()) {
       case 0:
-        nvforest::detail::infer(obj(),
-                                get_postprocessor(predict_type),
-                                output.data(),
-                                input.data(),
-                                index_type(input.size() / num_features_),
-                                num_features_,
-                                num_outputs(predict_type),
-                                has_categorical_nodes_,
-                                vector_output_data,
-                                categorical_storage_data,
-                                predict_type,
-                                specified_rows_per_block_iter,
-                                std::get<0>(nodes_.device()),
-                                stream);
+        detail::infer(obj(),
+                      get_postprocessor(predict_type),
+                      output.data(),
+                      input.data(),
+                      index_type(input.size() / num_features_),
+                      num_features_,
+                      num_outputs(predict_type),
+                      has_categorical_nodes_,
+                      vector_output_data,
+                      categorical_storage_data,
+                      predict_type,
+                      specified_rows_per_block_iter,
+                      std::get<0>(nodes_.device()),
+                      stream);
         break;
       case 1:
-        nvforest::detail::infer(obj(),
-                                get_postprocessor(predict_type),
-                                output.data(),
-                                input.data(),
-                                index_type(input.size() / num_features_),
-                                num_features_,
-                                num_outputs(predict_type),
-                                has_categorical_nodes_,
-                                vector_output_data,
-                                categorical_storage_data,
-                                predict_type,
-                                specified_rows_per_block_iter,
-                                std::get<1>(nodes_.device()),
-                                stream);
+        detail::infer(obj(),
+                      get_postprocessor(predict_type),
+                      output.data(),
+                      input.data(),
+                      index_type(input.size() / num_features_),
+                      num_features_,
+                      num_outputs(predict_type),
+                      has_categorical_nodes_,
+                      vector_output_data,
+                      categorical_storage_data,
+                      predict_type,
+                      specified_rows_per_block_iter,
+                      std::get<1>(nodes_.device()),
+                      stream);
         break;
     }
   }
 
  private:
   /** The nodes for all trees in the forest */
-  raft_proto::buffer<node_type> nodes_;
+  buffer<node_type> nodes_;
   /** The index of the root node for each tree in the forest */
-  raft_proto::buffer<index_type> root_node_indexes_;
+  buffer<index_type> root_node_indexes_;
   /** Mapping to apply to node IDs. Only relevant when predict_type == infer_kind::leaf_id */
-  raft_proto::buffer<index_type> node_id_mapping_;
+  buffer<index_type> node_id_mapping_;
   /** Bias term to apply to the output */
-  raft_proto::buffer<io_type> bias_;
+  buffer<io_type> bias_;
   /** Buffer of outputs for all leaves in vector-leaf models */
-  std::optional<raft_proto::buffer<io_type>> vector_output_;
+  std::optional<buffer<io_type>> vector_output_;
   /** Buffer of elements used as backing data for bitsets which specify
    * categories for all categorical nodes in the model. */
-  std::optional<raft_proto::buffer<categorical_storage_type>> categorical_storage_;
+  std::optional<buffer<categorical_storage_type>> categorical_storage_;
 
   // Metadata
   index_type num_features_;
@@ -458,7 +459,7 @@ inline auto get_forest_variant_index(bool use_double_thresholds,
   // TODO(wphicks): We are overestimating categorical storage required here
   auto double_indexes_required =
     (max_num_categories > max_local_categories &&
-     ((raft_proto::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) >
+     ((detail::ceildiv(max_num_categories, max_local_categories) + 1 * num_categorical_nodes) >
       std::numeric_limits<small_index_t>::max())) ||
     num_vector_leaves > std::numeric_limits<small_index_t>::max();
 
diff --git a/cpp/include/nvforest/detail/bitset.hpp b/cpp/include/nvforest/detail/bitset.hpp
index c11cdee..c166eb3 100644
--- a/cpp/include/nvforest/detail/bitset.hpp
+++ b/cpp/include/nvforest/detail/bitset.hpp
@@ -3,8 +3,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <cstddef>
 #include <type_traits>
diff --git a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp b/cpp/include/nvforest/detail/ceildiv.hpp
similarity index 79%
rename from cpp/include/nvforest/detail/raft_proto/ceildiv.hpp
rename to cpp/include/nvforest/detail/ceildiv.hpp
index 8f4fb6c..4d6245b 100644
--- a/cpp/include/nvforest/detail/raft_proto/ceildiv.hpp
+++ b/cpp/include/nvforest/detail/ceildiv.hpp
@@ -3,15 +3,15 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 
 #include <type_traits>
 
-namespace raft_proto {
+namespace nvforest::detail {
 template <typename T, typename U>
 HOST DEVICE auto constexpr ceildiv(T dividend, U divisor)
 {
   static_assert(std::is_integral_v<T> && std::is_integral_v<U>, "Arguments must be integers");
   return dividend / divisor + (dividend % divisor != 0);
 }
-}  // namespace raft_proto
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/const_agnostic.hpp b/cpp/include/nvforest/detail/const_agnostic.hpp
new file mode 100644
index 0000000..25d8a45
--- /dev/null
+++ b/cpp/include/nvforest/detail/const_agnostic.hpp
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <type_traits>
+
+namespace nvforest::detail {
+template <typename T, typename U, typename V = void>
+using const_agnostic_same_t =
+  std::enable_if_t<std::is_same_v<std::remove_const_t<T>, std::remove_const_t<U>>, V>;
+
+template <typename T, typename U>
+inline constexpr auto const_agnostic_same_v =
+  std::is_same_v<std::remove_const_t<T>, std::remove_const_t<U>>;
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp b/cpp/include/nvforest/detail/copy.hpp
similarity index 58%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy.hpp
rename to cpp/include/nvforest/detail/copy.hpp
index fffd3ac..b02a996 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy.hpp
+++ b/cpp/include/nvforest/detail/copy.hpp
@@ -3,39 +3,33 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/detail/copy/cpu.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/copy/cpu.hpp>
 
 #include <stdint.h>
 #ifdef NVFOREST_ENABLE_GPU
-#include <nvforest/detail/raft_proto/detail/copy/gpu.hpp>
+#include <nvforest/detail/copy/gpu.hpp>
 #endif
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 
-namespace raft_proto {
+namespace nvforest {
 template <device_type dst_type, device_type src_type, typename T>
 void copy(T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset)
 {
-  detail::copy<dst_type, src_type, T>(dst + dst_offset, src + src_offset, size, cuda_stream{});
+  copy<dst_type, src_type, T>(dst + dst_offset, src + src_offset, size, cuda_stream{});
 }
 
 template <device_type dst_type, device_type src_type, typename T>
 void copy(
   T* dst, T const* src, uint32_t size, uint32_t dst_offset, uint32_t src_offset, cuda_stream stream)
 {
-  detail::copy<dst_type, src_type, T>(dst + dst_offset, src + src_offset, size, stream);
+  copy<dst_type, src_type, T>(dst + dst_offset, src + src_offset, size, stream);
 }
 
 template <device_type dst_type, device_type src_type, typename T>
 void copy(T* dst, T const* src, uint32_t size)
 {
-  detail::copy<dst_type, src_type, T>(dst, src, size, cuda_stream{});
-}
-
-template <device_type dst_type, device_type src_type, typename T>
-void copy(T* dst, T const* src, uint32_t size, cuda_stream stream)
-{
-  detail::copy<dst_type, src_type, T>(dst, src, size, stream);
+  copy<dst_type, src_type, T>(dst, src, size, cuda_stream{});
 }
 
 template <typename T>
@@ -49,17 +43,13 @@ void copy(T* dst,
           cuda_stream stream)
 {
   if (dst_type == device_type::gpu && src_type == device_type::gpu) {
-    detail::copy<device_type::gpu, device_type::gpu, T>(
-      dst + dst_offset, src + src_offset, size, stream);
+    copy<device_type::gpu, device_type::gpu, T>(dst + dst_offset, src + src_offset, size, stream);
   } else if (dst_type == device_type::cpu && src_type == device_type::cpu) {
-    detail::copy<device_type::cpu, device_type::cpu, T>(
-      dst + dst_offset, src + src_offset, size, stream);
+    copy<device_type::cpu, device_type::cpu, T>(dst + dst_offset, src + src_offset, size, stream);
   } else if (dst_type == device_type::gpu && src_type == device_type::cpu) {
-    detail::copy<device_type::gpu, device_type::cpu, T>(
-      dst + dst_offset, src + src_offset, size, stream);
+    copy<device_type::gpu, device_type::cpu, T>(dst + dst_offset, src + src_offset, size, stream);
   } else if (dst_type == device_type::cpu && src_type == device_type::gpu) {
-    detail::copy<device_type::cpu, device_type::gpu, T>(
-      dst + dst_offset, src + src_offset, size, stream);
+    copy<device_type::cpu, device_type::gpu, T>(dst + dst_offset, src + src_offset, size, stream);
   }
 }
 
@@ -80,4 +70,4 @@ void copy(T* dst,
   copy<T>(dst, src, size, dst_type, src_type, 0, 0, stream);
 }
 
-}  // namespace raft_proto
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp b/cpp/include/nvforest/detail/copy/cpu.hpp
similarity index 71%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp
rename to cpp/include/nvforest/detail/copy/cpu.hpp
index c519c5b..796a97b 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy/cpu.hpp
+++ b/cpp/include/nvforest/detail/copy/cpu.hpp
@@ -3,16 +3,16 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <stdint.h>
 
 #include <algorithm>
 #include <cstring>
 
-namespace raft_proto::detail {
+namespace nvforest {
 
 template <device_type dst_type, device_type src_type, typename T>
 std::enable_if_t<std::conjunction_v<std::bool_constant<dst_type == device_type::cpu>,
@@ -27,11 +27,11 @@ template <device_type dst_type, device_type src_type, typename T>
 std::enable_if_t<
   std::conjunction_v<std::disjunction<std::bool_constant<dst_type != device_type::cpu>,
                                       std::bool_constant<src_type != device_type::cpu>>,
-                     std::bool_constant<!GPU_ENABLED>>,
+                     std::bool_constant<!detail::GPU_ENABLED>>,
   void>
 copy(T* dst, T const* src, uint32_t size, cuda_stream stream)
 {
-  throw gpu_unsupported("Copying from or to device in non-GPU build");
+  throw detail::gpu_unsupported("Copying from or to device in non-GPU build");
 }
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp b/cpp/include/nvforest/detail/copy/gpu.hpp
similarity index 58%
rename from cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp
rename to cpp/include/nvforest/detail/copy/gpu.hpp
index b5bcded..94ae559 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/copy/gpu.hpp
+++ b/cpp/include/nvforest/detail/copy/gpu.hpp
@@ -3,9 +3,9 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 
 #include <cuda_runtime_api.h>
 
@@ -13,17 +13,17 @@
 
 #include <type_traits>
 
-namespace raft_proto::detail {
+namespace nvforest {
 
 template <device_type dst_type, device_type src_type, typename T>
 std::enable_if_t<
   std::conjunction_v<std::disjunction<std::bool_constant<dst_type == device_type::gpu>,
                                       std::bool_constant<src_type == device_type::gpu>>,
-                     std::bool_constant<GPU_ENABLED>>,
+                     std::bool_constant<detail::GPU_ENABLED>>,
   void>
 copy(T* dst, T const* src, uint32_t size, cuda_stream stream)
 {
-  raft_proto::cuda_check(cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream));
+  detail::cuda_check(cudaMemcpyAsync(dst, src, size * sizeof(T), cudaMemcpyDefault, stream));
 }
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/detail/cuda_check.hpp b/cpp/include/nvforest/detail/cuda_check.hpp
new file mode 100644
index 0000000..b5c8e58
--- /dev/null
+++ b/cpp/include/nvforest/detail/cuda_check.hpp
@@ -0,0 +1,19 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/cuda_check/base.hpp>
+#ifdef NVFOREST_ENABLE_GPU
+#include <nvforest/detail/cuda_check/gpu.hpp>
+#endif
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/device_type.hpp>
+
+namespace nvforest::detail {
+template <typename error_t>
+void cuda_check(error_t const& err) noexcept(!GPU_ENABLED)
+{
+  cuda_check<device_type::gpu>(err);
+}
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp b/cpp/include/nvforest/detail/cuda_check/base.hpp
similarity index 64%
rename from cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp
rename to cpp/include/nvforest/detail/cuda_check/base.hpp
index 3f797d0..c697ddc 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/base.hpp
+++ b/cpp/include/nvforest/detail/cuda_check/base.hpp
@@ -3,13 +3,13 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 
 template <device_type D, typename error_t>
 void cuda_check(error_t const& err)
 {
 }
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp b/cpp/include/nvforest/detail/cuda_check/gpu.hpp
similarity index 61%
rename from cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp
rename to cpp/include/nvforest/detail/cuda_check/gpu.hpp
index e02b88e..917166c 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp
+++ b/cpp/include/nvforest/detail/cuda_check/gpu.hpp
@@ -3,12 +3,12 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/detail/cuda_check/base.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/exceptions.hpp>
+#include <nvforest/detail/cuda_check/base.hpp>
+#include <nvforest/detail/exceptions.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <cuda_runtime_api.h>
-namespace raft_proto::detail {
+namespace nvforest::detail {
 
 template <>
 inline void cuda_check<device_type::gpu, cudaError_t>(cudaError_t const& err) noexcept(false)
@@ -19,4 +19,4 @@ inline void cuda_check<device_type::gpu, cudaError_t>(cudaError_t const& err) no
   }
 }
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/decision_forest_builder.hpp b/cpp/include/nvforest/detail/decision_forest_builder.hpp
index 8d33ed7..0795cdd 100644
--- a/cpp/include/nvforest/detail/decision_forest_builder.hpp
+++ b/cpp/include/nvforest/detail/decision_forest_builder.hpp
@@ -3,13 +3,13 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/buffer.hpp>
+#include <nvforest/cuda_stream.hpp>
 #include <nvforest/detail/bitset.hpp>
+#include <nvforest/detail/ceildiv.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/index_type.hpp>
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/ceildiv.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/postproc_ops.hpp>
 
@@ -115,7 +115,7 @@ struct decision_forest_builder {
     }
     if (max_num_categories_ > bin_width) {
       node_value         = categorical_storage_.size();
-      auto bins_required = raft_proto::ceildiv(max_cat_plus_one, bin_width);
+      auto bins_required = ceildiv(max_cat_plus_one, bin_width);
       categorical_storage_.push_back(max_cat_plus_one);
       categorical_storage_.resize(categorical_storage_.size() + bins_required);
       set_storage = &(categorical_storage_[node_value + 1]);
@@ -228,12 +228,12 @@ struct decision_forest_builder {
   /* Return the nvForest decision forest built by this builder */
   auto get_decision_forest(index_type num_feature,
                            index_type num_class,
-                           raft_proto::device_type mem_type = raft_proto::device_type::cpu,
-                           int device                       = 0,
-                           raft_proto::cuda_stream stream   = raft_proto::cuda_stream{})
+                           device_type mem_type = device_type::cpu,
+                           int device           = 0,
+                           cuda_stream stream   = cuda_stream{})
   {
     // Set device = -1 when loading the model onto CPU
-    if (mem_type == raft_proto::device_type::cpu) { device = -1; }
+    if (mem_type == device_type::cpu) { device = -1; }
 
     // Validate forest invariants the inference kernel relies on. After this
     // function returns, the forest is treated as trusted by the kernel.
@@ -267,7 +267,7 @@ struct decision_forest_builder {
                                    " >= " + std::to_string(storage_size) + ")"};
         }
         auto const stored_num_cats = categorical_storage_[offset];
-        auto const bins_required   = raft_proto::ceildiv(stored_num_cats, cat_bin_width);
+        auto const bins_required   = ceildiv(stored_num_cats, cat_bin_width);
         auto const bits_begin      = static_cast<std::size_t>(offset) + std::size_t{1};
         auto const bits_end        = bits_begin + static_cast<std::size_t>(bins_required);
         if (bits_end > storage_size) {
@@ -295,31 +295,22 @@ struct decision_forest_builder {
     }
 
     return decision_forest_t{
-      raft_proto::buffer{
-        raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
-      raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()},
-                         mem_type,
-                         device,
-                         stream},
-      raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()},
-                         mem_type,
-                         device,
-                         stream},
-      raft_proto::buffer{raft_proto::buffer{bias_.data(), bias_.size()}, mem_type, device, stream},
+      buffer{buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
+      buffer{
+        buffer{root_node_indexes_.data(), root_node_indexes_.size()}, mem_type, device, stream},
+      buffer{buffer{node_id_mapping_.data(), node_id_mapping_.size()}, mem_type, device, stream},
+      buffer{buffer{bias_.data(), bias_.size()}, mem_type, device, stream},
       num_feature,
       num_class,
       max_num_categories_ != 0,
       vector_output_.empty()
         ? std::nullopt
-        : std::make_optional<raft_proto::buffer<typename node_type::threshold_type>>(
-            raft_proto::buffer{vector_output_.data(), vector_output_.size()},
-            mem_type,
-            device,
-            stream),
+        : std::make_optional<buffer<typename node_type::threshold_type>>(
+            buffer{vector_output_.data(), vector_output_.size()}, mem_type, device, stream),
       categorical_storage_.empty()
         ? std::nullopt
-        : std::make_optional<raft_proto::buffer<typename node_type::index_type>>(
-            raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()},
+        : std::make_optional<buffer<typename node_type::index_type>>(
+            buffer{categorical_storage_.data(), categorical_storage_.size()},
             mem_type,
             device,
             stream),
diff --git a/cpp/include/nvforest/detail/device_id.hpp b/cpp/include/nvforest/detail/device_id.hpp
new file mode 100644
index 0000000..76bd20d
--- /dev/null
+++ b/cpp/include/nvforest/detail/device_id.hpp
@@ -0,0 +1,18 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <nvforest/detail/device_id/base.hpp>
+#include <nvforest/detail/device_id/cpu.hpp>
+#ifdef NVFOREST_ENABLE_GPU
+#include <nvforest/detail/device_id/gpu.hpp>
+#endif
+#include <nvforest/device_type.hpp>
+
+#include <variant>
+
+namespace nvforest::detail {
+using device_id_variant = std::variant<device_id<device_type::cpu>, device_id<device_type::gpu>>;
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp b/cpp/include/nvforest/detail/device_id/base.hpp
similarity index 70%
rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp
rename to cpp/include/nvforest/detail/device_id/base.hpp
index 214d1d8..fa56ad7 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/base.hpp
+++ b/cpp/include/nvforest/detail/device_id/base.hpp
@@ -3,9 +3,9 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <device_type D>
 struct device_id {
   using value_type = int;
@@ -13,4 +13,4 @@ struct device_id {
   device_id(value_type device_index) {}
   auto value() const { return value_type{}; }
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp b/cpp/include/nvforest/detail/device_id/cpu.hpp
similarity index 66%
rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp
rename to cpp/include/nvforest/detail/device_id/cpu.hpp
index 3c4eded..be7105a 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/cpu.hpp
+++ b/cpp/include/nvforest/detail/device_id/cpu.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/detail/device_id/base.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/device_id/base.hpp>
+#include <nvforest/device_type.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <>
 struct device_id<device_type::cpu> {
   using value_type = int;
@@ -18,4 +18,4 @@ struct device_id<device_type::cpu> {
  private:
   value_type id_;
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp b/cpp/include/nvforest/detail/device_id/gpu.hpp
similarity index 65%
rename from cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp
rename to cpp/include/nvforest/detail/device_id/gpu.hpp
index 6248878..3cc0d01 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/device_id/gpu.hpp
+++ b/cpp/include/nvforest/detail/device_id/gpu.hpp
@@ -3,20 +3,20 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/detail/device_id/base.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/device_id/base.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <rmm/cuda_device.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <>
 struct device_id<device_type::gpu> {
   using value_type = typename rmm::cuda_device_id::value_type;
   device_id() noexcept(false)
     : id_{[]() {
         auto raw_id = value_type{};
-        raft_proto::cuda_check(cudaGetDevice(&raw_id));
+        cuda_check(cudaGetDevice(&raw_id));
         return raw_id;
       }()} {};
   device_id(value_type dev_id) noexcept : id_{dev_id} {};
@@ -26,4 +26,4 @@ struct device_id<device_type::gpu> {
  private:
   rmm::cuda_device_id id_;
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/device_initialization.hpp b/cpp/include/nvforest/detail/device_initialization.hpp
index bb1bff6..47a07ea 100644
--- a/cpp/include/nvforest/detail/device_initialization.hpp
+++ b/cpp/include/nvforest/detail/device_initialization.hpp
@@ -13,15 +13,15 @@
 
 namespace nvforest::detail {
 /* Set any required device options for optimizing nvForest compute */
-template <typename forest_t, raft_proto::device_type D>
-void initialize_device(raft_proto::device_id<D> device)
+template <typename forest_t, device_type D>
+void initialize_device(device_id<D> device)
 {
   device_initialization::initialize_device<forest_t>(device);
 }
 
 /* Set any required device options for optimizing nvForest compute */
 template <typename forest_t>
-void initialize_device(raft_proto::device_id_variant device)
+void initialize_device(device_id_variant device)
 {
   std::visit(
     [](auto&& concrete_device) {
diff --git a/cpp/include/nvforest/detail/device_initialization/cpu.hpp b/cpp/include/nvforest/detail/device_initialization/cpu.hpp
index dd2747e..5ae78ff 100644
--- a/cpp/include/nvforest/detail/device_initialization/cpu.hpp
+++ b/cpp/include/nvforest/detail/device_initialization/cpu.hpp
@@ -4,9 +4,9 @@
  */
 #pragma once
 
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <type_traits>
 
@@ -17,11 +17,11 @@ namespace nvforest::detail::device_initialization {
  * This specialization will also be used for non-GPU-enabled builds
  * (as a GPU no-op).
  */
-template <typename forest_t, raft_proto::device_type D>
-std::enable_if_t<std::disjunction_v<std::bool_constant<!raft_proto::GPU_ENABLED>,
-                                    std::bool_constant<D == raft_proto::device_type::cpu>>,
-                 void>
-initialize_device(raft_proto::device_id<D> device)
+template <typename forest_t, device_type D>
+std::enable_if_t<
+  std::disjunction_v<std::bool_constant<!GPU_ENABLED>, std::bool_constant<D == device_type::cpu>>,
+  void>
+initialize_device(device_id<D> device)
 {
 }
 
diff --git a/cpp/include/nvforest/detail/device_initialization/gpu.cuh b/cpp/include/nvforest/detail/device_initialization/gpu.cuh
index e95ffea..001c627 100644
--- a/cpp/include/nvforest/detail/device_initialization/gpu.cuh
+++ b/cpp/include/nvforest/detail/device_initialization/gpu.cuh
@@ -5,15 +5,15 @@
 #pragma once
 
 #include <nvforest/constants.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/device_setter.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/gpu_introspection.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/infer_kernel/gpu.cuh>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_setter.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
 #include <nvforest/detail/specializations/device_initialization_macros.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <cuda_runtime_api.h>
 
@@ -27,197 +27,178 @@ namespace nvforest::detail::device_initialization {
  * the inference kernels have access to the maximum available dynamic shared
  * memory.
  */
-template <typename forest_t, raft_proto::device_type D>
-std::enable_if_t<std::conjunction_v<std::bool_constant<raft_proto::GPU_ENABLED>,
-                                    std::bool_constant<D == raft_proto::device_type::gpu>>,
-                 void>
-initialize_device(raft_proto::device_id<D> device)
+template <typename forest_t, device_type D>
+std::enable_if_t<
+  std::conjunction_v<std::bool_constant<GPU_ENABLED>, std::bool_constant<D == device_type::gpu>>,
+  void>
+initialize_device(device_id<D> device)
 {
-  auto device_context           = raft_proto::device_setter(device);
+  auto device_context           = device_setter(device);
   auto max_shared_mem_per_block = get_max_shared_mem_per_block(device);
   // Run solely for side-effect of caching SM count
   get_sm_count(device);
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 1, forest_t, std::nullptr_t, std::nullptr_t>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<false, 2, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<false, 4, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<false, 8, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<false, 16, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<false, 32, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 1, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 2, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 4, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 8, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 16, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<false, 32, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 1, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 2, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 4, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 8, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 16, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(infer_kernel<true, 32, forest_t>,
-                                              cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                              max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 1, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 2, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 4, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 8, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 16, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true, 32, forest_t, typename forest_t::io_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 1, forest_t, std::nullptr_t, std::nullptr_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 2, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 4, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 8, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 16, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 32, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 1, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 2, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 4, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 8, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 16, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<false, 32, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 1, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 2, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 4, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 8, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 16, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 32, forest_t>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 1, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 2, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 4, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 8, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 16, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true, 32, forest_t, typename forest_t::io_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 1, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 2, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 4, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 8, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 16, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 32, forest_t, std::nullptr_t, typename forest_t::node_type::index_type*>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 1, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 2, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 4, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 8, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 16, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(cudaFuncSetAttribute(
+  cuda_check(cudaFuncSetAttribute(
     infer_kernel<true, 32, forest_t, typename forest_t::io_type*, std::nullptr_t>,
     cudaFuncAttributeMaxDynamicSharedMemorySize,
     max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      1,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      2,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      4,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      8,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      16,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
-  raft_proto::cuda_check(
-    cudaFuncSetAttribute(infer_kernel<true,
-                                      32,
-                                      forest_t,
-                                      typename forest_t::io_type*,
-                                      typename forest_t::node_type::index_type*>,
-                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-                         max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               1,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               2,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               4,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               8,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               16,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
+  cuda_check(cudaFuncSetAttribute(infer_kernel<true,
+                                               32,
+                                               forest_t,
+                                               typename forest_t::io_type*,
+                                               typename forest_t::node_type::index_type*>,
+                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                  max_shared_mem_per_block));
 }
 
 NVFOREST_INITIALIZE_DEVICE(extern template, 0)
diff --git a/cpp/include/nvforest/detail/device_initialization/gpu.hpp b/cpp/include/nvforest/detail/device_initialization/gpu.hpp
index 431777a..5593fd4 100644
--- a/cpp/include/nvforest/detail/device_initialization/gpu.hpp
+++ b/cpp/include/nvforest/detail/device_initialization/gpu.hpp
@@ -4,10 +4,9 @@
  */
 #pragma once
 
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_setter.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <type_traits>
 
@@ -16,10 +15,10 @@ namespace nvforest::detail::device_initialization {
 /* Non-CUDA header declaration of the GPU specialization for device
  * initialization
  */
-template <typename forest_t, raft_proto::device_type D>
-std::enable_if_t<std::conjunction_v<std::bool_constant<raft_proto::GPU_ENABLED>,
-                                    std::bool_constant<D == raft_proto::device_type::gpu>>,
-                 void>
-initialize_device(raft_proto::device_id<D> device);
+template <typename forest_t, device_type D>
+std::enable_if_t<
+  std::conjunction_v<std::bool_constant<GPU_ENABLED>, std::bool_constant<D == device_type::gpu>>,
+  void>
+initialize_device(device_id<D> device);
 
 }  // namespace nvforest::detail::device_initialization
diff --git a/cpp/include/nvforest/detail/device_setter.hpp b/cpp/include/nvforest/detail/device_setter.hpp
new file mode 100644
index 0000000..6a9c162
--- /dev/null
+++ b/cpp/include/nvforest/detail/device_setter.hpp
@@ -0,0 +1,10 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/device_setter/base.hpp>
+#ifdef NVFOREST_ENABLE_GPU
+#include <nvforest/detail/device_setter/gpu.hpp>
+#endif
+#include <nvforest/device_type.hpp>
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp b/cpp/include/nvforest/detail/device_setter/base.hpp
similarity index 62%
rename from cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp
rename to cpp/include/nvforest/detail/device_setter/base.hpp
index 1fe46d8..069c62d 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/base.hpp
+++ b/cpp/include/nvforest/detail/device_setter/base.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/device_type.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 
 /** Struct for setting current device within a code block */
 template <device_type D>
@@ -14,4 +14,4 @@ struct device_setter {
   device_setter(device_id<D> device) {}
 };
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp b/cpp/include/nvforest/detail/device_setter/gpu.hpp
similarity index 53%
rename from cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp
rename to cpp/include/nvforest/detail/device_setter/gpu.hpp
index a256538..40564fd 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/device_setter/gpu.hpp
+++ b/cpp/include/nvforest/detail/device_setter/gpu.hpp
@@ -3,28 +3,28 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/detail/device_setter/base.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/device_setter/base.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <raft/util/cudart_utils.hpp>
 
 #include <cuda_runtime_api.h>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 
 /** Struct for setting current device within a code block */
 template <>
 struct device_setter<device_type::gpu> {
-  device_setter(raft_proto::device_id<device_type::gpu> device) noexcept(false)
+  device_setter(device_id<device_type::gpu> device) noexcept(false)
     : prev_device_{[]() {
         auto result = int{};
-        raft_proto::cuda_check(cudaGetDevice(&result));
+        cuda_check(cudaGetDevice(&result));
         return result;
       }()}
   {
-    raft_proto::cuda_check(cudaSetDevice(device.value()));
+    cuda_check(cudaSetDevice(device.value()));
   }
 
   ~device_setter() { RAFT_CUDA_TRY_NO_THROW(cudaSetDevice(prev_device_.value())); }
@@ -33,4 +33,4 @@ struct device_setter<device_type::gpu> {
   device_id<device_type::gpu> prev_device_;
 };
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/evaluate_tree.hpp b/cpp/include/nvforest/detail/evaluate_tree.hpp
index cd23b26..4b3d0bd 100644
--- a/cpp/include/nvforest/detail/evaluate_tree.hpp
+++ b/cpp/include/nvforest/detail/evaluate_tree.hpp
@@ -10,7 +10,7 @@
 #include <math.h>
 #endif
 #include <nvforest/detail/bitset.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 namespace nvforest::detail {
 
 /*
diff --git a/cpp/include/nvforest/detail/raft_proto/exceptions.hpp b/cpp/include/nvforest/detail/exceptions.hpp
similarity index 67%
rename from cpp/include/nvforest/detail/raft_proto/exceptions.hpp
rename to cpp/include/nvforest/detail/exceptions.hpp
index 9c9ab27..eebefb3 100644
--- a/cpp/include/nvforest/detail/raft_proto/exceptions.hpp
+++ b/cpp/include/nvforest/detail/exceptions.hpp
@@ -1,22 +1,11 @@
 /*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
 #include <exception>
 
-namespace raft_proto {
+namespace nvforest::detail {
 struct bad_cuda_call : std::exception {
   bad_cuda_call() : bad_cuda_call("CUDA API call failed") {}
   bad_cuda_call(char const* msg) : msg_{msg} {}
@@ -64,4 +53,4 @@ struct wrong_device : std::exception {
   char const* msg_;
 };
 
-}  // namespace raft_proto
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/forest.hpp b/cpp/include/nvforest/detail/forest.hpp
index a6c141b..6ad4c86 100644
--- a/cpp/include/nvforest/detail/forest.hpp
+++ b/cpp/include/nvforest/detail/forest.hpp
@@ -3,9 +3,9 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/node.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
 
 #include <stddef.h>
 
diff --git a/cpp/include/nvforest/detail/gpu_introspection.hpp b/cpp/include/nvforest/detail/gpu_introspection.hpp
index 2c8537f..891bff0 100644
--- a/cpp/include/nvforest/detail/gpu_introspection.hpp
+++ b/cpp/include/nvforest/detail/gpu_introspection.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/index_type.hpp>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <cuda_runtime_api.h>
 
@@ -14,72 +14,69 @@
 
 namespace nvforest::detail {
 
-inline auto get_max_shared_mem_per_block(
-  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_max_shared_mem_per_block(device_id<nvforest::device_type::gpu> device_id)
 {
   auto thread_local cache = std::vector<int>{};
   if (cache.size() == 0) {
     auto device_count = int{};
-    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
+    cuda_check(cudaGetDeviceCount(&device_count));
     cache.resize(device_count);
     for (auto dev = 0; dev < device_count; ++dev) {
-      raft_proto::cuda_check(
+      cuda_check(
         cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
     }
   }
   return index_type(cache.at(device_id.value()));
 }
 
-inline auto get_sm_count(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_sm_count(device_id<nvforest::device_type::gpu> device_id)
 {
   auto thread_local cache = std::vector<int>{};
   if (cache.size() == 0) {
     auto device_count = int{};
-    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
+    cuda_check(cudaGetDeviceCount(&device_count));
     cache.resize(device_count);
     for (auto dev = 0; dev < device_count; ++dev) {
-      raft_proto::cuda_check(
-        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
+      cuda_check(cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
     }
   }
   return index_type(cache.at(device_id.value()));
 }
 
-inline auto get_max_threads_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_max_threads_per_sm(device_id<nvforest::device_type::gpu> device_id)
 {
   auto result = int{};
-  raft_proto::cuda_check(
+  cuda_check(
     cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value()));
   return index_type(result);
 }
 
-inline auto get_max_shared_mem_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_max_shared_mem_per_sm(device_id<nvforest::device_type::gpu> device_id)
 {
   auto thread_local cache = std::vector<int>{};
   if (cache.size() == 0) {
     auto device_count = int{};
-    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
+    cuda_check(cudaGetDeviceCount(&device_count));
     cache.resize(device_count);
     for (auto dev = 0; dev < device_count; ++dev) {
-      raft_proto::cuda_check(
+      cuda_check(
         cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
     }
   }
   return index_type(cache.at(device_id.value()));
 }
 
-inline auto get_mem_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_mem_clock_rate(device_id<nvforest::device_type::gpu> device_id)
 {
   auto result = int{};
-  raft_proto::cuda_check(
-    cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
+  cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
   return index_type(result);
 }
 
-inline auto get_core_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
+inline auto get_core_clock_rate(device_id<nvforest::device_type::gpu> device_id)
 {
   auto result = int{};
-  raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
+  cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
   return index_type(result);
 }
 
diff --git a/cpp/include/nvforest/detail/raft_proto/gpu_support.hpp b/cpp/include/nvforest/detail/gpu_support.hpp
similarity index 94%
rename from cpp/include/nvforest/detail/raft_proto/gpu_support.hpp
rename to cpp/include/nvforest/detail/gpu_support.hpp
index 6d03363..9994632 100644
--- a/cpp/include/nvforest/detail/raft_proto/gpu_support.hpp
+++ b/cpp/include/nvforest/detail/gpu_support.hpp
@@ -8,7 +8,7 @@
 #include <cstddef>
 #include <exception>
 
-namespace raft_proto {
+namespace nvforest::detail {
 #ifdef NVFOREST_ENABLE_GPU
 auto constexpr static const GPU_ENABLED = true;
 #else
@@ -42,4 +42,4 @@ struct gpu_unsupported : std::exception {
   char const* msg_;
 };
 
-}  // namespace raft_proto
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/host_only_throw.hpp b/cpp/include/nvforest/detail/host_only_throw.hpp
new file mode 100644
index 0000000..beb4e34
--- /dev/null
+++ b/cpp/include/nvforest/detail/host_only_throw.hpp
@@ -0,0 +1,8 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/detail/host_only_throw/base.hpp>
+#include <nvforest/detail/host_only_throw/cpu.hpp>
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp b/cpp/include/nvforest/detail/host_only_throw/base.hpp
similarity index 66%
rename from cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp
rename to cpp/include/nvforest/detail/host_only_throw/base.hpp
index b6c81c5..607eac6 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/base.hpp
+++ b/cpp/include/nvforest/detail/host_only_throw/base.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 
-namespace raft_proto::detail {
-template <typename T, bool host>
+namespace nvforest::detail {
+template <typename T, bool host = !GPU_COMPILATION>
 struct host_only_throw {
   template <typename... Args>
   host_only_throw(Args&&... args)
@@ -14,4 +14,4 @@ struct host_only_throw {
     static_assert(host);  // Do not allow constexpr branch to compile if !host
   }
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp b/cpp/include/nvforest/detail/host_only_throw/cpu.hpp
similarity index 62%
rename from cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp
rename to cpp/include/nvforest/detail/host_only_throw/cpu.hpp
index 3dad51f..d6b893e 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp
+++ b/cpp/include/nvforest/detail/host_only_throw/cpu.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/detail/host_only_throw/base.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
+#include <nvforest/detail/host_only_throw/base.hpp>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <typename T>
 struct host_only_throw<T, true> {
   template <typename... Args>
@@ -15,4 +15,4 @@ struct host_only_throw<T, true> {
     throw T{std::forward<Args>(args)...};
   }
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/infer.hpp b/cpp/include/nvforest/detail/infer.hpp
index d4005fd..553105d 100644
--- a/cpp/include/nvforest/detail/infer.hpp
+++ b/cpp/include/nvforest/detail/infer.hpp
@@ -3,12 +3,12 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/infer_kind.hpp>
 
@@ -51,7 +51,7 @@ namespace nvforest::detail {
  * @param device The device on which to execute evaluation
  * @param stream Optionally, the CUDA stream to use
  */
-template <raft_proto::device_type D, typename forest_t>
+template <device_type D, typename forest_t>
 void infer(forest_t const& forest,
            postprocessor<typename forest_t::io_type> const& postproc,
            typename forest_t::io_type* output,
@@ -64,8 +64,8 @@ void infer(forest_t const& forest,
            typename forest_t::node_type::index_type* categorical_data = nullptr,
            infer_kind infer_type                                      = infer_kind::default_kind,
            std::optional<index_type> specified_chunk_size             = std::nullopt,
-           raft_proto::device_id<D> device                            = raft_proto::device_id<D>{},
-           raft_proto::cuda_stream stream                             = raft_proto::cuda_stream{})
+           device_id<D> device                                        = device_id<D>{},
+           cuda_stream stream                                         = cuda_stream{})
 {
   if (vector_output == nullptr) {
     if (categorical_data == nullptr) {
diff --git a/cpp/include/nvforest/detail/infer/cpu.hpp b/cpp/include/nvforest/detail/infer/cpu.hpp
index b3142d1..5e1f8c7 100644
--- a/cpp/include/nvforest/detail/infer/cpu.hpp
+++ b/cpp/include/nvforest/detail/infer/cpu.hpp
@@ -5,16 +5,16 @@
 #pragma once
 
 #include <nvforest/constants.hpp>
+#include <nvforest/cuda_stream.hpp>
 #include <nvforest/detail/cpu_introspection.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/forest.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/infer_kernel/cpu.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #include <cstddef>
@@ -62,14 +62,14 @@ namespace nvforest::detail::inference {
  * (for individual row inference) to 512 (for very large batch
  * inference). A value of 64 is a generally-useful default.
  */
-template <raft_proto::device_type D,
+template <device_type D,
           bool has_categorical_nodes,
           typename forest_t,
           typename vector_output_t    = std::nullptr_t,
           typename categorical_data_t = std::nullptr_t>
-std::enable_if_t<std::disjunction_v<std::bool_constant<D == raft_proto::device_type::cpu>,
-                                    std::bool_constant<!raft_proto::GPU_ENABLED>>,
-                 void>
+std::enable_if_t<
+  std::disjunction_v<std::bool_constant<D == device_type::cpu>, std::bool_constant<!GPU_ENABLED>>,
+  void>
 infer(forest_t const& forest,
       postprocessor<typename forest_t::io_type> const& postproc,
       typename forest_t::io_type* output,
@@ -81,11 +81,11 @@ infer(forest_t const& forest,
       categorical_data_t categorical_data            = nullptr,
       infer_kind infer_type                          = infer_kind::default_kind,
       std::optional<index_type> specified_chunk_size = std::nullopt,
-      raft_proto::device_id<D> device                = raft_proto::device_id<D>{},
-      raft_proto::cuda_stream                        = raft_proto::cuda_stream{})
+      device_id<D> device                            = device_id<D>{},
+      cuda_stream                                    = cuda_stream{})
 {
-  if constexpr (D == raft_proto::device_type::gpu) {
-    throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build");
+  if constexpr (D == device_type::gpu) {
+    throw gpu_unsupported("Tried to use GPU inference in CPU-only build");
   } else {
     if (infer_type == infer_kind::leaf_id) {
       infer_kernel_cpu<has_categorical_nodes, true>(
@@ -124,17 +124,17 @@ infer(forest_t const& forest,
  * compiled as few times as possible. A macro is used because ever
  * specialization must be explicitly declared. The final argument to the macro
  * references the 8 specialization variants compiled in standard nvForest. */
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 0)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 1)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 2)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 3)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 8)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 9)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 10)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::cpu, 11)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 0)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 1)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 2)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 3)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 4)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 5)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 6)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 7)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 8)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 9)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 10)
+NVFOREST_INFER_ALL(extern template, device_type::cpu, 11)
 
 }  // namespace nvforest::detail::inference
diff --git a/cpp/include/nvforest/detail/infer/gpu.cuh b/cpp/include/nvforest/detail/infer/gpu.cuh
index def3f8d..16b8f52 100644
--- a/cpp/include/nvforest/detail/infer/gpu.cuh
+++ b/cpp/include/nvforest/detail/infer/gpu.cuh
@@ -3,21 +3,21 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/buffer.hpp>
 #include <nvforest/constants.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/ceildiv.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/gpu_introspection.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/infer_kernel/gpu.cuh>
+#include <nvforest/detail/padding.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/ceildiv.hpp>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/detail/raft_proto/padding.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/infer_kind.hpp>
 
@@ -36,7 +36,7 @@ inline auto compute_output_size(index_type row_output_size,
 {
   auto result = row_output_size * rows_per_block_iteration;
   if (infer_type == infer_kind::default_kind) {
-    result *= raft_proto::ceildiv(threads_per_block, rows_per_block_iteration);
+    result *= ceildiv(threads_per_block, rows_per_block_iteration);
   }
   return result;
 }
@@ -77,12 +77,12 @@ inline auto compute_output_size(index_type row_output_size,
  * value depends on hardware, model, and batch size. Valid values are any power
  * of 2 from 1 to 32.
  */
-template <raft_proto::device_type D,
+template <device_type D,
           bool has_categorical_nodes,
           typename forest_t,
           typename vector_output_t    = std::nullptr_t,
           typename categorical_data_t = std::nullptr_t>
-std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
+std::enable_if_t<D == device_type::gpu, void> infer(
   forest_t const& forest,
   postprocessor<typename forest_t::io_type> const& postproc,
   typename forest_t::io_type* output,
@@ -94,8 +94,8 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   categorical_data_t categorical_data            = nullptr,
   infer_kind infer_type                          = infer_kind::default_kind,
   std::optional<index_type> specified_chunk_size = std::nullopt,
-  raft_proto::device_id<D> device                = raft_proto::device_id<D>{},
-  raft_proto::cuda_stream stream                 = raft_proto::cuda_stream{})
+  device_id<D> device                            = device_id<D>{},
+  cuda_stream stream                             = cuda_stream{})
 {
   using output_t = typename forest_t::template raw_output_type<vector_output_t>;
 
@@ -114,15 +114,15 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   // block.
   auto threads_per_block =
     min(MAX_THREADS_PER_BLOCK,
-        raft_proto::downpadded_size(
-          (max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes, WARP_SIZE));
+        downpadded_size((max_shared_mem_per_block - row_size_bytes) / row_output_size_bytes,
+                        WARP_SIZE));
 
   // If we cannot do at least a warp per block when storing input rows in
   // shared mem, recalculate our threads per block without input storage
   if (threads_per_block < WARP_SIZE) {
     threads_per_block =
       min(MAX_THREADS_PER_BLOCK,
-          raft_proto::downpadded_size(max_shared_mem_per_block / row_output_size_bytes, WARP_SIZE));
+          downpadded_size(max_shared_mem_per_block / row_output_size_bytes, WARP_SIZE));
     if (threads_per_block >= WARP_SIZE) {
       row_size_bytes = index_type{};  // Do not store input rows in shared mem
     }
@@ -131,9 +131,8 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   // If we cannot do at least a warp per block when storing output in
   // shared mem, recalculate our threads per block with ONLY input storage
   if (threads_per_block < WARP_SIZE) {
-    threads_per_block =
-      min(MAX_THREADS_PER_BLOCK,
-          raft_proto::downpadded_size(max_shared_mem_per_block / row_size_bytes, WARP_SIZE));
+    threads_per_block = min(MAX_THREADS_PER_BLOCK,
+                            downpadded_size(max_shared_mem_per_block / row_size_bytes, WARP_SIZE));
   }
 
   // If we still cannot use at least a warp per block, give up on using
@@ -149,7 +148,7 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   auto output_workspace_size =
     compute_output_size(row_output_size, threads_per_block, rows_per_block_iteration, infer_type);
   auto output_workspace_size_bytes = output_item_bytes * output_workspace_size;
-  auto global_workspace            = raft_proto::buffer<output_t>{};
+  auto global_workspace            = buffer<output_t>{};
 
   if (output_workspace_size_bytes > max_shared_mem_per_block) {
     output_workspace_size_bytes = 0;
@@ -160,7 +159,7 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
         max_overall_shared_mem);
 
   auto resident_blocks_per_sm =
-    min(raft_proto::ceildiv(max_shared_mem_per_sm, shared_mem_per_block), max_resident_blocks);
+    min(ceildiv(max_shared_mem_per_sm, shared_mem_per_block), max_resident_blocks);
 
   // If caller has not specified the number of rows per block iteration, apply
   // the following heuristic to identify an approximately optimal value
@@ -188,10 +187,10 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   shared_mem_per_block = std::min(
     max_overall_shared_mem, max_shared_mem_per_sm / (max_shared_mem_per_sm / shared_mem_per_block));
 
-  auto num_blocks = std::min(raft_proto::ceildiv(row_count, rows_per_block_iteration), MAX_BLOCKS);
+  auto num_blocks = std::min(ceildiv(row_count, rows_per_block_iteration), MAX_BLOCKS);
   if (row_output_size == 0) {
-    global_workspace = raft_proto::buffer<output_t>{
-      output_workspace_size * num_blocks, raft_proto::device_type::gpu, device.value(), stream};
+    global_workspace = buffer<output_t>{
+      output_workspace_size * num_blocks, device_type::gpu, device.value(), stream};
   }
 
   /**
@@ -209,8 +208,8 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
     auto num_grove  = [infer_type, threads_per_block, task_count, chunk_size]() {
       auto result = std::uint64_t{1};
       if (infer_type == infer_kind::default_kind) {
-        result = raft_proto::ceildiv(min(static_cast<std::uint64_t>(threads_per_block), task_count),
-                                     chunk_size);
+        result =
+          ceildiv(min(static_cast<std::uint64_t>(threads_per_block), task_count), chunk_size);
       }
       return result;
     }();
@@ -318,7 +317,7 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
                                                                         infer_type,
                                                                         global_workspace.data());
   }
-  raft_proto::cuda_check(cudaGetLastError());
+  cuda_check(cudaGetLastError());
 }
 
 /* This macro is invoked here to declare all standard specializations of this
@@ -326,13 +325,13 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
  * compiled as few times as possible. A macro is used because ever
  * specialization must be explicitly declared. The final argument to the macro
  * references the 8 specialization variants compiled in nvForest. */
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 0)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 1)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 2)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 3)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 4)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 5)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 6)
-NVFOREST_INFER_ALL(extern template, raft_proto::device_type::gpu, 7)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 0)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 1)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 2)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 3)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 4)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 5)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 6)
+NVFOREST_INFER_ALL(extern template, device_type::gpu, 7)
 
 }  // namespace nvforest::detail::inference
diff --git a/cpp/include/nvforest/detail/infer/gpu.hpp b/cpp/include/nvforest/detail/infer/gpu.hpp
index b68fdf4..5de10d4 100644
--- a/cpp/include/nvforest/detail/infer/gpu.hpp
+++ b/cpp/include/nvforest/detail/infer/gpu.hpp
@@ -2,12 +2,12 @@
  * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #include <cstddef>
@@ -16,12 +16,12 @@
 namespace nvforest::detail::inference {
 
 /* The CUDA-free header declaration of the GPU infer template */
-template <raft_proto::device_type D,
+template <device_type D,
           bool has_categorical_nodes,
           typename forest_t,
           typename vector_output_t    = std::nullptr_t,
           typename categorical_data_t = std::nullptr_t>
-std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
+std::enable_if_t<D == device_type::gpu, void> infer(
   forest_t const& forest,
   postprocessor<typename forest_t::io_type> const& postproc,
   typename forest_t::io_type* output,
@@ -33,7 +33,7 @@ std::enable_if_t<D == raft_proto::device_type::gpu, void> infer(
   categorical_data_t categorical_data            = nullptr,
   infer_kind infer_type                          = infer_kind::default_kind,
   std::optional<index_type> specified_chunk_size = std::nullopt,
-  raft_proto::device_id<D> device                = raft_proto::device_id<D>{},
-  raft_proto::cuda_stream stream                 = raft_proto::cuda_stream{});
+  device_id<D> device                            = device_id<D>{},
+  cuda_stream stream                             = cuda_stream{});
 
 }  // namespace nvforest::detail::inference
diff --git a/cpp/include/nvforest/detail/infer_kernel/cpu.hpp b/cpp/include/nvforest/detail/infer_kernel/cpu.hpp
index b3db750..5c82d26 100644
--- a/cpp/include/nvforest/detail/infer_kernel/cpu.hpp
+++ b/cpp/include/nvforest/detail/infer_kernel/cpu.hpp
@@ -3,11 +3,11 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/detail/ceildiv.hpp>
 #include <nvforest/detail/cpu_introspection.hpp>
 #include <nvforest/detail/evaluate_tree.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/ceildiv.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/infer_kind.hpp>
 
@@ -94,8 +94,8 @@ void infer_kernel_cpu(forest_t const& forest,
   using output_t = typename forest_t::template raw_output_type<vector_output_t>;
 
   auto const num_tree  = forest.tree_count();
-  auto const num_grove = raft_proto::ceildiv(num_tree, grove_size);
-  auto const num_chunk = raft_proto::ceildiv(row_count, chunk_size);
+  auto const num_grove = ceildiv(num_tree, grove_size);
+  auto const num_chunk = ceildiv(row_count, chunk_size);
 
   /**
    * Throw an error for large inputs that would cause integer overflow.
diff --git a/cpp/include/nvforest/detail/infer_kernel/gpu.cuh b/cpp/include/nvforest/detail/infer_kernel/gpu.cuh
index 2e11809..b221293 100644
--- a/cpp/include/nvforest/detail/infer_kernel/gpu.cuh
+++ b/cpp/include/nvforest/detail/infer_kernel/gpu.cuh
@@ -3,13 +3,13 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/detail/ceildiv.hpp>
 #include <nvforest/detail/evaluate_tree.hpp>
 #include <nvforest/detail/gpu_introspection.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/infer_kernel/shared_memory_buffer.cuh>
+#include <nvforest/detail/padding.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/ceildiv.hpp>
-#include <nvforest/detail/raft_proto/padding.hpp>
 #include <nvforest/detail/utils.hpp>
 #include <nvforest/infer_kind.hpp>
 
@@ -109,7 +109,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM)
 
     auto task_count = chunk_size * forest.tree_count();
 
-    auto num_grove = raft_proto::ceildiv(min(index_type(blockDim.x), task_count), chunk_size) *
+    auto num_grove = ceildiv(min(index_type(blockDim.x), task_count), chunk_size) *
                        (infer_type == infer_kind::default_kind) +
                      (infer_type != infer_kind::default_kind);
 
@@ -121,7 +121,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM)
     // deadlock on __syncthreads, so we round the task_count up to the next
     // multiple of the number of threads in this block. We then only perform
     // work within the loop if the task_index is below the actual task_count.
-    auto const task_count_rounded_up = blockDim.x * raft_proto::ceildiv(task_count, blockDim.x);
+    auto const task_count_rounded_up = blockDim.x * ceildiv(task_count, blockDim.x);
 
     // Infer on each tree and row
     for (auto task_index = threadIdx.x; task_index < task_count_rounded_up;
@@ -175,7 +175,7 @@ NVFOREST_KERNEL void __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM)
       __syncthreads();
     }
 
-    auto padded_num_groves = raft_proto::padded_size(num_grove, WARP_SIZE);
+    auto padded_num_groves = padded_size(num_grove, WARP_SIZE);
     for (auto row_index = threadIdx.x / WARP_SIZE; row_index < rows_in_this_iteration;
          row_index += blockDim.x / WARP_SIZE) {
       for (auto class_index = index_type{}; class_index < num_outputs; ++class_index) {
diff --git a/cpp/include/nvforest/detail/node.hpp b/cpp/include/nvforest/detail/node.hpp
index 5cdf73f..f3b97d5 100644
--- a/cpp/include/nvforest/detail/node.hpp
+++ b/cpp/include/nvforest/detail/node.hpp
@@ -4,8 +4,8 @@
  */
 #pragma once
 
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/tree_layout.hpp>
 
diff --git a/cpp/include/nvforest/detail/non_owning_buffer.hpp b/cpp/include/nvforest/detail/non_owning_buffer.hpp
new file mode 100644
index 0000000..f552e35
--- /dev/null
+++ b/cpp/include/nvforest/detail/non_owning_buffer.hpp
@@ -0,0 +1,7 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/non_owning_buffer/base.hpp>
+#include <nvforest/device_type.hpp>
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp b/cpp/include/nvforest/detail/non_owning_buffer/base.hpp
similarity index 82%
rename from cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp
rename to cpp/include/nvforest/detail/non_owning_buffer/base.hpp
index ae1ce76..f851d7c 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp
+++ b/cpp/include/nvforest/detail/non_owning_buffer/base.hpp
@@ -3,12 +3,12 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <memory>
 #include <type_traits>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <device_type D, typename T>
 struct non_owning_buffer {
   // TODO(wphicks): Assess need for buffers of const T
@@ -23,4 +23,4 @@ struct non_owning_buffer {
   // TODO(wphicks): Back this with RMM-allocated host memory
   T* data_;
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/owning_buffer.hpp b/cpp/include/nvforest/detail/owning_buffer.hpp
new file mode 100644
index 0000000..90f1d13
--- /dev/null
+++ b/cpp/include/nvforest/detail/owning_buffer.hpp
@@ -0,0 +1,10 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+#include <nvforest/detail/owning_buffer/cpu.hpp>
+#include <nvforest/device_type.hpp>
+#ifdef NVFOREST_ENABLE_GPU
+#include <nvforest/detail/owning_buffer/gpu.hpp>
+#endif
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp b/cpp/include/nvforest/detail/owning_buffer/base.hpp
similarity index 62%
rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp
rename to cpp/include/nvforest/detail/owning_buffer/base.hpp
index b4d02d1..aa5f24b 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/base.hpp
+++ b/cpp/include/nvforest/detail/owning_buffer/base.hpp
@@ -3,13 +3,13 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <type_traits>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 
 template <device_type D, typename T>
 struct owning_buffer {
@@ -18,4 +18,4 @@ struct owning_buffer {
   auto* get() const { return static_cast<T*>(nullptr); }
 };
 
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp b/cpp/include/nvforest/detail/owning_buffer/cpu.hpp
similarity index 72%
rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp
rename to cpp/include/nvforest/detail/owning_buffer/cpu.hpp
index d7be3e3..cc54462 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp
+++ b/cpp/include/nvforest/detail/owning_buffer/cpu.hpp
@@ -3,14 +3,14 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/detail/owning_buffer/base.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/owning_buffer/base.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <memory>
 #include <type_traits>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <typename T>
 struct owning_buffer<device_type::cpu, T> {
   // TODO(wphicks): Assess need for buffers of const T
@@ -26,4 +26,4 @@ struct owning_buffer<device_type::cpu, T> {
   // TODO(wphicks): Back this with RMM-allocated host memory
   std::unique_ptr<T[]> data_;
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp b/cpp/include/nvforest/detail/owning_buffer/gpu.hpp
similarity index 75%
rename from cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp
rename to cpp/include/nvforest/detail/owning_buffer/gpu.hpp
index 01bc57a..8419452 100644
--- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp
+++ b/cpp/include/nvforest/detail/owning_buffer/gpu.hpp
@@ -3,10 +3,10 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/detail/owning_buffer/base.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_setter.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/device_id.hpp>
+#include <nvforest/detail/device_setter.hpp>
+#include <nvforest/detail/owning_buffer/base.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <rmm/device_buffer.hpp>
 
@@ -14,7 +14,7 @@
 
 #include <type_traits>
 
-namespace raft_proto::detail {
+namespace nvforest::detail {
 template <typename T>
 struct owning_buffer<device_type::gpu, T> {
   // TODO(wphicks): Assess need for buffers of const T
@@ -36,4 +36,4 @@ struct owning_buffer<device_type::gpu, T> {
  private:
   mutable rmm::device_buffer data_;
 };
-}  // namespace raft_proto::detail
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/raft_proto/padding.hpp b/cpp/include/nvforest/detail/padding.hpp
similarity index 91%
rename from cpp/include/nvforest/detail/raft_proto/padding.hpp
rename to cpp/include/nvforest/detail/padding.hpp
index 2ceb32f..8419cfc 100644
--- a/cpp/include/nvforest/detail/raft_proto/padding.hpp
+++ b/cpp/include/nvforest/detail/padding.hpp
@@ -3,9 +3,9 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 
-namespace raft_proto {
+namespace nvforest::detail {
 
 /* Return the value that must be added to val to equal the next multiple of
  * alignment greater than or equal to val */
@@ -45,4 +45,4 @@ HOST DEVICE auto downpadded_size(T val, U alignment)
   return val - downpadding_size(val, alignment);
 }
 
-}  // namespace raft_proto
+}  // namespace nvforest::detail
diff --git a/cpp/include/nvforest/detail/postprocessor.hpp b/cpp/include/nvforest/detail/postprocessor.hpp
index cdf24ca..9b7cb84 100644
--- a/cpp/include/nvforest/detail/postprocessor.hpp
+++ b/cpp/include/nvforest/detail/postprocessor.hpp
@@ -4,8 +4,8 @@
  */
 #pragma once
 
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
 #include <nvforest/infer_kind.hpp>
 #include <nvforest/postproc_ops.hpp>
 
diff --git a/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp b/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp
deleted file mode 100644
index ab294dc..0000000
--- a/cpp/include/nvforest/detail/raft_proto/cuda_check.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/detail/cuda_check/base.hpp>
-#ifdef NVFOREST_ENABLE_GPU
-#include <nvforest/detail/raft_proto/detail/cuda_check/gpu.hpp>
-#endif
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
-
-namespace raft_proto {
-template <typename error_t>
-void cuda_check(error_t const& err) noexcept(!GPU_ENABLED)
-{
-  detail::cuda_check<device_type::gpu>(err);
-}
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp b/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp
deleted file mode 100644
index 6d61711..0000000
--- a/cpp/include/nvforest/detail/raft_proto/detail/const_agnostic.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-#include <type_traits>
-
-namespace raft_proto {
-template <typename T, typename U, typename V = void>
-using const_agnostic_same_t =
-  std::enable_if_t<std::is_same_v<std::remove_const_t<T>, std::remove_const_t<U>>, V>;
-
-template <typename T, typename U>
-inline constexpr auto const_agnostic_same_v =
-  std::is_same_v<std::remove_const_t<T>, std::remove_const_t<U>>;
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp b/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp
deleted file mode 100644
index 3524cc4..0000000
--- a/cpp/include/nvforest/detail/raft_proto/detail/host_only_throw.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/detail/host_only_throw/base.hpp>
-#include <nvforest/detail/raft_proto/detail/host_only_throw/cpu.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
-
-namespace raft_proto {
-template <typename T, bool host = !GPU_COMPILATION>
-using host_only_throw = detail::host_only_throw<T, host>;
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp b/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp
deleted file mode 100644
index 16d3d7c..0000000
--- a/cpp/include/nvforest/detail/raft_proto/detail/non_owning_buffer.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/detail/non_owning_buffer/base.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-
-namespace raft_proto {
-template <device_type D, typename T>
-using non_owning_buffer = detail::non_owning_buffer<D, T>;
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp b/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp
deleted file mode 100644
index a8e0797..0000000
--- a/cpp/include/nvforest/detail/raft_proto/detail/owning_buffer.hpp
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/detail/owning_buffer/cpu.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#ifdef NVFOREST_ENABLE_GPU
-#include <nvforest/detail/raft_proto/detail/owning_buffer/gpu.hpp>
-#endif
-namespace raft_proto {
-template <device_type D, typename T>
-using owning_buffer = detail::owning_buffer<D, T>;
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/device_id.hpp b/cpp/include/nvforest/detail/raft_proto/device_id.hpp
deleted file mode 100644
index 0f6b1d9..0000000
--- a/cpp/include/nvforest/detail/raft_proto/device_id.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-
-#include <nvforest/detail/raft_proto/detail/device_id/base.hpp>
-#include <nvforest/detail/raft_proto/detail/device_id/cpu.hpp>
-#ifdef NVFOREST_ENABLE_GPU
-#include <nvforest/detail/raft_proto/detail/device_id/gpu.hpp>
-#endif
-#include <nvforest/detail/raft_proto/device_type.hpp>
-
-#include <variant>
-
-namespace raft_proto {
-template <device_type D>
-using device_id = detail::device_id<D>;
-
-using device_id_variant = std::variant<device_id<device_type::cpu>, device_id<device_type::gpu>>;
-}  // namespace raft_proto
diff --git a/cpp/include/nvforest/detail/raft_proto/device_setter.hpp b/cpp/include/nvforest/detail/raft_proto/device_setter.hpp
deleted file mode 100644
index fd729ff..0000000
--- a/cpp/include/nvforest/detail/raft_proto/device_setter.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
- * SPDX-License-Identifier: Apache-2.0
- */
-#pragma once
-#include <nvforest/detail/raft_proto/detail/device_setter/base.hpp>
-#ifdef NVFOREST_ENABLE_GPU
-#include <nvforest/detail/raft_proto/detail/device_setter/gpu.hpp>
-#endif
-#include <nvforest/detail/raft_proto/device_type.hpp>
-
-namespace raft_proto {
-
-using device_setter = detail::device_setter<device_type::gpu>;
-
-}
diff --git a/cpp/include/nvforest/detail/raft_proto/device_type.hpp b/cpp/include/nvforest/detail/raft_proto/device_type.hpp
deleted file mode 100644
index 730ce48..0000000
--- a/cpp/include/nvforest/detail/raft_proto/device_type.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-namespace raft_proto {
-enum class device_type { cpu, gpu };
-}
diff --git a/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp b/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp
index 578d44e..800de94 100644
--- a/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp
+++ b/cpp/include/nvforest/detail/specializations/device_initialization_macros.hpp
@@ -3,12 +3,11 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/specializations/forest_macros.hpp>
+#include <nvforest/device_type.hpp>
 /* Declare device initialization function for the types specified by the given
  * variant index */
-#define NVFOREST_INITIALIZE_DEVICE(template_type, variant_index)                     \
-  template_type void                                                                 \
-    initialize_device<NVFOREST_FOREST(variant_index), raft_proto::device_type::gpu>( \
-      raft_proto::device_id<raft_proto::device_type::gpu>);
+#define NVFOREST_INITIALIZE_DEVICE(template_type, variant_index)                          \
+  template_type void initialize_device<NVFOREST_FOREST(variant_index), device_type::gpu>( \
+    device_id<device_type::gpu>);
diff --git a/cpp/include/nvforest/detail/specializations/infer_macros.hpp b/cpp/include/nvforest/detail/specializations/infer_macros.hpp
index 1fba70f..7e18c9c 100644
--- a/cpp/include/nvforest/detail/specializations/infer_macros.hpp
+++ b/cpp/include/nvforest/detail/specializations/infer_macros.hpp
@@ -4,14 +4,14 @@
  */
 #pragma once
 #include <nvforest/constants.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/device_id.hpp>
 #include <nvforest/detail/forest.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/postprocessor.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_id.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
 #include <nvforest/detail/specialization_types.hpp>
 #include <nvforest/detail/specializations/forest_macros.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #include <cstddef>
@@ -31,8 +31,8 @@
    std::nullptr_t,                                                     \
    infer_kind,                                                         \
    std::optional<index_type>,                                          \
-   raft_proto::device_id<dev>,                                         \
-   raft_proto::cuda_stream stream)
+   device_id<dev>,                                                     \
+   cuda_stream stream)
 
 /* Macro which expands to the valid arguments to an inference call for a forest
  * model with vector leaves but without non-local categorical data.*/
@@ -48,8 +48,8 @@
    std::nullptr_t,                                                     \
    infer_kind,                                                         \
    std::optional<index_type>,                                          \
-   raft_proto::device_id<dev>,                                         \
-   raft_proto::cuda_stream stream)
+   device_id<dev>,                                                     \
+   cuda_stream stream)
 
 /* Macro which expands to the valid arguments to an inference call for a forest
  * model without vector leaves but with non-local categorical data.*/
@@ -65,8 +65,8 @@
    NVFOREST_SPEC(variant_index)::index_type*,                          \
    infer_kind,                                                         \
    std::optional<index_type>,                                          \
-   raft_proto::device_id<dev>,                                         \
-   raft_proto::cuda_stream stream)
+   device_id<dev>,                                                     \
+   cuda_stream stream)
 
 /* Macro which expands to the valid arguments to an inference call for a forest
  * model with vector leaves and with non-local categorical data.*/
@@ -82,8 +82,8 @@
    NVFOREST_SPEC(variant_index)::index_type*,                          \
    infer_kind,                                                         \
    std::optional<index_type>,                                          \
-   raft_proto::device_id<dev>,                                         \
-   raft_proto::cuda_stream stream)
+   device_id<dev>,                                                     \
+   cuda_stream stream)
 
 /* Macro which expands to the declaration of an inference template for a forest
  * of the type indicated by the variant index */
diff --git a/cpp/include/nvforest/device_type.hpp b/cpp/include/nvforest/device_type.hpp
new file mode 100644
index 0000000..1478d6e
--- /dev/null
+++ b/cpp/include/nvforest/device_type.hpp
@@ -0,0 +1,8 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+namespace nvforest {
+enum class device_type { cpu, gpu };
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/forest_model.hpp b/cpp/include/nvforest/forest_model.hpp
index 9c07f13..d3fc7b2 100644
--- a/cpp/include/nvforest/forest_model.hpp
+++ b/cpp/include/nvforest/forest_model.hpp
@@ -3,12 +3,16 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
+#include <nvforest/buffer.hpp>
+#include <nvforest/cuda_stream.hpp>
 #include <nvforest/decision_forest.hpp>
+#include <nvforest/detail/ceildiv.hpp>
+#include <nvforest/detail/copy.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/gpu_support.hpp>
 #include <nvforest/detail/index_type.hpp>
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/gpu_support.hpp>
-#include <nvforest/detail/raft_proto/handle.hpp>
+#include <nvforest/device_type.hpp>
+#include <nvforest/handle.hpp>
 #include <nvforest/infer_kind.hpp>
 
 #ifdef NVFOREST_ENABLE_GPU
@@ -116,7 +120,7 @@ struct forest_model {
    * @param[out] output The buffer where model output should be stored.
    * This must be of size at least ROWS x num_outputs().
    * @param[in] input The buffer containing input data.
-   * @param[in] stream A raft_proto::cuda_stream, which (on GPU-enabled builds) is
+   * @param[in] stream A nvforest::cuda_stream, which (on GPU-enabled builds) is
    * a transparent wrapper for the cudaStream_t or (on CPU-only builds) a
    * CUDA-free placeholder object.
    * @param[in] predict_type Type of inference to perform. Defaults to summing
@@ -134,9 +138,9 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft_proto::buffer<io_t>& output,
-               raft_proto::buffer<io_t> const& input,
-               raft_proto::cuda_stream stream                 = raft_proto::cuda_stream{},
+  void predict(buffer<io_t>& output,
+               buffer<io_t> const& input,
+               cuda_stream stream                             = cuda_stream{},
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
@@ -157,7 +161,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
+   * @param[in] handle The nvforest::handle_t (wrapper for raft::handle_t
    * on GPU) which will be used to provide streams for evaluation.
    * @param[out] output The buffer where model output should be stored. If
    * this buffer is on host while the model is on device or vice versa,
@@ -182,9 +186,9 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft_proto::handle_t const& handle,
-               raft_proto::buffer<io_t>& output,
-               raft_proto::buffer<io_t> const& input,
+  void predict(handle_t const& handle,
+               buffer<io_t>& output,
+               buffer<io_t> const& input,
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
@@ -202,48 +206,44 @@ struct forest_model {
 
             auto row_count = input.size() / num_features();
             auto partition_size =
-              std::max(raft_proto::ceildiv(row_count, handle.get_usable_stream_count()),
+              std::max(detail::ceildiv(row_count, handle.get_usable_stream_count()),
                        specified_chunk_size.value_or(MAX_CHUNK_SIZE) * MIN_CHUNKS_PER_PARTITION);
-            auto partition_count = raft_proto::ceildiv(row_count, partition_size);
+            auto partition_count = detail::ceildiv(row_count, partition_size);
             for (auto i = std::size_t{}; i < partition_count; ++i) {
               auto stream = handle.get_next_usable_stream();
               auto rows_in_this_partition =
                 std::min(partition_size, row_count - i * partition_size);
-              auto partition_in = raft_proto::buffer<io_t>{};
+              auto partition_in = buffer<io_t>{};
               if (input.memory_type() != memory_type()) {
-                partition_in =
-                  raft_proto::buffer<io_t>{rows_in_this_partition * num_features(), memory_type()};
-                raft_proto::copy<raft_proto::DEBUG_ENABLED>(partition_in,
-                                                            input,
-                                                            0,
-                                                            i * partition_size * num_features(),
-                                                            partition_in.size(),
-                                                            stream);
+                partition_in = buffer<io_t>{rows_in_this_partition * num_features(), memory_type()};
+                copy<detail::DEBUG_ENABLED>(partition_in,
+                                            input,
+                                            0,
+                                            i * partition_size * num_features(),
+                                            partition_in.size(),
+                                            stream);
               } else {
-                partition_in =
-                  raft_proto::buffer<io_t>{input.data() + i * partition_size * num_features(),
-                                           rows_in_this_partition * num_features(),
-                                           memory_type()};
+                partition_in = buffer<io_t>{input.data() + i * partition_size * num_features(),
+                                            rows_in_this_partition * num_features(),
+                                            memory_type()};
               }
-              auto partition_out = raft_proto::buffer<io_t>{};
+              auto partition_out = buffer<io_t>{};
               if (output.memory_type() != memory_type()) {
-                partition_out =
-                  raft_proto::buffer<io_t>{rows_in_this_partition * num_outputs(), memory_type()};
+                partition_out = buffer<io_t>{rows_in_this_partition * num_outputs(), memory_type()};
               } else {
-                partition_out =
-                  raft_proto::buffer<io_t>{output.data() + i * partition_size * num_outputs(),
-                                           rows_in_this_partition * num_outputs(),
-                                           memory_type()};
+                partition_out = buffer<io_t>{output.data() + i * partition_size * num_outputs(),
+                                             rows_in_this_partition * num_outputs(),
+                                             memory_type()};
               }
               concrete_forest.predict(
                 partition_out, partition_in, stream, predict_type, specified_chunk_size);
               if (output.memory_type() != memory_type()) {
-                raft_proto::copy<raft_proto::DEBUG_ENABLED>(output,
-                                                            partition_out,
-                                                            i * partition_size * num_outputs(),
-                                                            0,
-                                                            partition_out.size(),
-                                                            stream);
+                copy<detail::DEBUG_ENABLED>(output,
+                                            partition_out,
+                                            i * partition_size * num_outputs(),
+                                            0,
+                                            partition_out.size(),
+                                            stream);
               }
             }
           }
@@ -257,7 +257,7 @@ struct forest_model {
   /**
    * Perform inference on given input
    *
-   * @param[in] handle The raft_proto::handle_t (wrapper for raft::handle_t
+   * @param[in] handle The nvforest::handle_t (wrapper for raft::handle_t
    * on GPU) which will be used to provide streams for evaluation.
    * @param[out] output Pointer to the memory location where output should end
    * up
@@ -281,30 +281,27 @@ struct forest_model {
    * reasonable value. On CPU, this argument can generally just be omitted.
    */
   template <typename io_t>
-  void predict(raft_proto::handle_t const& handle,
+  void predict(handle_t const& handle,
                io_t* output,
                io_t* input,
                std::size_t num_rows,
-               raft_proto::device_type out_mem_type,
-               raft_proto::device_type in_mem_type,
+               device_type out_mem_type,
+               device_type in_mem_type,
                infer_kind predict_type                        = infer_kind::default_kind,
                std::optional<index_type> specified_chunk_size = std::nullopt)
   {
     int current_device_id;
-    if (out_mem_type == raft_proto::device_type::gpu ||
-        in_mem_type == raft_proto::device_type::gpu) {
+    if (out_mem_type == device_type::gpu || in_mem_type == device_type::gpu) {
 #ifdef NVFOREST_ENABLE_GPU
-      raft_proto::cuda_check(cudaGetDevice(&current_device_id));
+      detail::cuda_check(cudaGetDevice(&current_device_id));
 #else
-      throw raft_proto::gpu_unsupported("Tried to use GPU memory in CPU-only build");
+      throw detail::gpu_unsupported("Tried to use GPU memory in CPU-only build");
 #endif
     } else {
       current_device_id = -1;
     }
-    auto out_buffer =
-      raft_proto::buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id};
-    auto in_buffer =
-      raft_proto::buffer{input, num_rows * num_features(), in_mem_type, current_device_id};
+    auto out_buffer = buffer{output, num_rows * num_outputs(), out_mem_type, current_device_id};
+    auto in_buffer  = buffer{input, num_rows * num_features(), in_mem_type, current_device_id};
     predict(handle, out_buffer, in_buffer, predict_type, specified_chunk_size);
   }
 
diff --git a/cpp/include/nvforest/detail/raft_proto/handle.hpp b/cpp/include/nvforest/handle.hpp
similarity index 80%
rename from cpp/include/nvforest/detail/raft_proto/handle.hpp
rename to cpp/include/nvforest/handle.hpp
index 086d61b..861f09b 100644
--- a/cpp/include/nvforest/detail/raft_proto/handle.hpp
+++ b/cpp/include/nvforest/handle.hpp
@@ -3,7 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
+#include <nvforest/cuda_stream.hpp>
 
 #include <algorithm>
 #include <cstddef>
@@ -11,14 +11,14 @@
 #include <raft/core/handle.hpp>
 #endif
 
-namespace raft_proto {
+namespace nvforest {
 #ifdef NVFOREST_ENABLE_GPU
 struct handle_t {
   handle_t(raft::handle_t const* handle_ptr = nullptr) : raft_handle_{handle_ptr} {}
   handle_t(raft::handle_t const& raft_handle) : raft_handle_{&raft_handle} {}
   auto get_next_usable_stream() const
   {
-    return raft_proto::cuda_stream{raft_handle_->get_next_usable_stream().value()};
+    return cuda_stream{raft_handle_->get_next_usable_stream().value()};
   }
   auto get_stream_pool_size() const { return raft_handle_->get_stream_pool_size(); }
   auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
@@ -34,10 +34,10 @@ struct handle_t {
 };
 #else
 struct handle_t {
-  auto get_next_usable_stream() const { return raft_proto::cuda_stream{}; }
+  auto get_next_usable_stream() const { return cuda_stream{}; }
   auto get_stream_pool_size() const { return std::size_t{}; }
   auto get_usable_stream_count() const { return std::max(get_stream_pool_size(), std::size_t{1}); }
   void synchronize() const {}
 };
 #endif
-}  // namespace raft_proto
+}  // namespace nvforest
diff --git a/cpp/include/nvforest/treelite_importer.hpp b/cpp/include/nvforest/treelite_importer.hpp
index de4d06b..927f8e5 100644
--- a/cpp/include/nvforest/treelite_importer.hpp
+++ b/cpp/include/nvforest/treelite_importer.hpp
@@ -4,11 +4,13 @@
  */
 #pragma once
 #include <nvforest/constants.hpp>
+#include <nvforest/cuda_stream.hpp>
 #include <nvforest/decision_forest.hpp>
 #include <nvforest/detail/decision_forest_builder.hpp>
 #include <nvforest/detail/degenerate_trees.hpp>
 #include <nvforest/detail/index_type.hpp>
 #include <nvforest/detail/integration/treelite.hpp>
+#include <nvforest/device_type.hpp>
 #include <nvforest/exceptions.hpp>
 #include <nvforest/forest_model.hpp>
 #include <nvforest/postproc_ops.hpp>
@@ -245,10 +247,10 @@ struct treelite_importer {
                                   index_type num_feature,
                                   index_type max_num_categories,
                                   std::vector<index_type> const& offsets,
-                                  index_type align_bytes           = index_type{},
-                                  raft_proto::device_type mem_type = raft_proto::device_type::cpu,
-                                  int device                       = 0,
-                                  raft_proto::cuda_stream stream   = raft_proto::cuda_stream{})
+                                  index_type align_bytes         = index_type{},
+                                  nvforest::device_type mem_type = nvforest::device_type::cpu,
+                                  int device                     = 0,
+                                  nvforest::cuda_stream stream   = nvforest::cuda_stream{})
   {
     auto result = decision_forest_variant{};
     if constexpr (variant_index != std::variant_size_v<decision_forest_variant>) {
@@ -362,9 +364,9 @@ struct treelite_importer {
   forest_model import(treelite::Model const& tl_model,
                       index_type align_bytes                   = index_type{},
                       std::optional<bool> use_double_precision = std::nullopt,
-                      raft_proto::device_type dev_type         = raft_proto::device_type::cpu,
+                      nvforest::device_type dev_type           = nvforest::device_type::cpu,
                       int device                               = 0,
-                      raft_proto::cuda_stream stream           = raft_proto::cuda_stream{})
+                      nvforest::cuda_stream stream             = nvforest::cuda_stream{})
   {
     validate_model_shape(tl_model);
 
@@ -460,14 +462,13 @@ struct treelite_importer {
  * @param stream The CUDA stream to use for loading this model (can be
  * omitted for CPU).
  */
-inline auto import_from_treelite_model(
-  treelite::Model const& tl_model,
-  tree_layout layout                       = preferred_tree_layout,
-  index_type align_bytes                   = index_type{},
-  std::optional<bool> use_double_precision = std::nullopt,
-  raft_proto::device_type dev_type         = raft_proto::device_type::cpu,
-  int device                               = 0,
-  raft_proto::cuda_stream stream           = raft_proto::cuda_stream{})
+inline auto import_from_treelite_model(treelite::Model const& tl_model,
+                                       tree_layout layout     = preferred_tree_layout,
+                                       index_type align_bytes = index_type{},
+                                       std::optional<bool> use_double_precision = std::nullopt,
+                                       nvforest::device_type dev_type = nvforest::device_type::cpu,
+                                       int device                     = 0,
+                                       nvforest::cuda_stream stream   = nvforest::cuda_stream{})
 {
   auto result = forest_model{};
   switch (layout) {
@@ -511,14 +512,13 @@ inline auto import_from_treelite_model(
  * @param stream The CUDA stream to use for loading this model (can be
  * omitted for CPU).
  */
-inline auto import_from_treelite_handle(
-  TreeliteModelHandle tl_handle,
-  tree_layout layout                       = preferred_tree_layout,
-  index_type align_bytes                   = index_type{},
-  std::optional<bool> use_double_precision = std::nullopt,
-  raft_proto::device_type dev_type         = raft_proto::device_type::cpu,
-  int device                               = 0,
-  raft_proto::cuda_stream stream           = raft_proto::cuda_stream{})
+inline auto import_from_treelite_handle(TreeliteModelHandle tl_handle,
+                                        tree_layout layout     = preferred_tree_layout,
+                                        index_type align_bytes = index_type{},
+                                        std::optional<bool> use_double_precision = std::nullopt,
+                                        nvforest::device_type dev_type = nvforest::device_type::cpu,
+                                        int device                     = 0,
+                                        nvforest::cuda_stream stream   = nvforest::cuda_stream{})
 {
   return import_from_treelite_model(*static_cast<treelite::Model*>(tl_handle),
                                     layout,
diff --git a/cpp/src/infer0.cpp b/cpp/src/infer0.cpp
index e77ba2f..81d59c3 100644
--- a/cpp/src/infer0.cpp
+++ b/cpp/src/infer0.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 0)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 0)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer0.cu b/cpp/src/infer0.cu
index 16f8f35..ff0b117 100644
--- a/cpp/src/infer0.cu
+++ b/cpp/src/infer0.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 0)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 0)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 0)
diff --git a/cpp/src/infer1.cpp b/cpp/src/infer1.cpp
index a5c1be9..cb83347 100644
--- a/cpp/src/infer1.cpp
+++ b/cpp/src/infer1.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 1)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 1)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer1.cu b/cpp/src/infer1.cu
index bd24cb8..43ebbcb 100644
--- a/cpp/src/infer1.cu
+++ b/cpp/src/infer1.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 1)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 1)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 1)
diff --git a/cpp/src/infer10.cpp b/cpp/src/infer10.cpp
index 1f3b000..d7ee644 100644
--- a/cpp/src/infer10.cpp
+++ b/cpp/src/infer10.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 10)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 10)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer10.cu b/cpp/src/infer10.cu
index fae886d..f91864a 100644
--- a/cpp/src/infer10.cu
+++ b/cpp/src/infer10.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 10)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 10)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 10)
diff --git a/cpp/src/infer11.cpp b/cpp/src/infer11.cpp
index 88df365..8cfcbae 100644
--- a/cpp/src/infer11.cpp
+++ b/cpp/src/infer11.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 11)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 11)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer11.cu b/cpp/src/infer11.cu
index e74034d..22450a2 100644
--- a/cpp/src/infer11.cu
+++ b/cpp/src/infer11.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 11)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 11)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 11)
diff --git a/cpp/src/infer2.cpp b/cpp/src/infer2.cpp
index 2b44e54..bd47291 100644
--- a/cpp/src/infer2.cpp
+++ b/cpp/src/infer2.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 2)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 2)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer2.cu b/cpp/src/infer2.cu
index e232010..acbcda0 100644
--- a/cpp/src/infer2.cu
+++ b/cpp/src/infer2.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 2)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 2)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 2)
diff --git a/cpp/src/infer3.cpp b/cpp/src/infer3.cpp
index 958dde8..59c0130 100644
--- a/cpp/src/infer3.cpp
+++ b/cpp/src/infer3.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 3)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 3)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer3.cu b/cpp/src/infer3.cu
index d70481e..9f3e49e 100644
--- a/cpp/src/infer3.cu
+++ b/cpp/src/infer3.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 3)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 3)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 3)
diff --git a/cpp/src/infer4.cpp b/cpp/src/infer4.cpp
index 88a15c2..d8f500e 100644
--- a/cpp/src/infer4.cpp
+++ b/cpp/src/infer4.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 4)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 4)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer4.cu b/cpp/src/infer4.cu
index 7c0e985..74509a4 100644
--- a/cpp/src/infer4.cu
+++ b/cpp/src/infer4.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 4)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 4)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 4)
diff --git a/cpp/src/infer5.cpp b/cpp/src/infer5.cpp
index 09105dc..fed7c30 100644
--- a/cpp/src/infer5.cpp
+++ b/cpp/src/infer5.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 5)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 5)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer5.cu b/cpp/src/infer5.cu
index e243630..c098803 100644
--- a/cpp/src/infer5.cu
+++ b/cpp/src/infer5.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 5)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 5)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 5)
diff --git a/cpp/src/infer6.cpp b/cpp/src/infer6.cpp
index 5bebe27..4ad13b5 100644
--- a/cpp/src/infer6.cpp
+++ b/cpp/src/infer6.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 6)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 6)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer6.cu b/cpp/src/infer6.cu
index b5face9..8d1d1db 100644
--- a/cpp/src/infer6.cu
+++ b/cpp/src/infer6.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 6)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 6)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 6)
diff --git a/cpp/src/infer7.cpp b/cpp/src/infer7.cpp
index e8f3e5b..8bfd2d9 100644
--- a/cpp/src/infer7.cpp
+++ b/cpp/src/infer7.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 7)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 7)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer7.cu b/cpp/src/infer7.cu
index 4104c82..c722005 100644
--- a/cpp/src/infer7.cu
+++ b/cpp/src/infer7.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 7)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 7)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 7)
diff --git a/cpp/src/infer8.cpp b/cpp/src/infer8.cpp
index 3597baf..c190556 100644
--- a/cpp/src/infer8.cpp
+++ b/cpp/src/infer8.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 8)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 8)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer8.cu b/cpp/src/infer8.cu
index 2278699..bad5de4 100644
--- a/cpp/src/infer8.cu
+++ b/cpp/src/infer8.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 8)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 8)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 8)
diff --git a/cpp/src/infer9.cpp b/cpp/src/infer9.cpp
index 9f84457..7bd9234 100644
--- a/cpp/src/infer9.cpp
+++ b/cpp/src/infer9.cpp
@@ -5,5 +5,5 @@
 #include <nvforest/detail/infer/cpu.hpp>
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail::inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::cpu, 9)
+NVFOREST_INFER_ALL(template, nvforest::device_type::cpu, 9)
 }  // namespace nvforest::detail::inference
diff --git a/cpp/src/infer9.cu b/cpp/src/infer9.cu
index 052b5db..4a26bdf 100644
--- a/cpp/src/infer9.cu
+++ b/cpp/src/infer9.cu
@@ -8,7 +8,7 @@
 #include <nvforest/detail/specializations/infer_macros.hpp>
 namespace nvforest::detail {
 namespace inference {
-NVFOREST_INFER_ALL(template, raft_proto::device_type::gpu, 9)
+NVFOREST_INFER_ALL(template, nvforest::device_type::gpu, 9)
 }  // namespace inference
 namespace device_initialization {
 NVFOREST_INITIALIZE_DEVICE(template, 9)
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4430c46..0595f8a 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -80,8 +80,8 @@ endfunction()
 
 # ######################################################################################################################
 # * Build ml_test executable -------------------------------------------------
-ConfigureTest(NAME HOST_BUFFER_TEST raft_proto/buffer.cpp)
-ConfigureTest(NAME DEVICE_BUFFER_TEST raft_proto/buffer.cu)
+ConfigureTest(NAME HOST_BUFFER_TEST buffer/buffer.cpp)
+ConfigureTest(NAME DEVICE_BUFFER_TEST buffer/buffer.cu)
 ConfigureTest(NAME FOREST_TRAVERSAL_TEST forest/traversal_forest.cpp)
 ConfigureTest(NAME TREELITE_TRAVERSAL_TEST forest/treelite_traversal.cpp)
 ConfigureTest(NAME TREELITE_IMPORTER_TEST treelite_importer.cpp treelite_importer_invalid_inputs.cpp
diff --git a/cpp/tests/raft_proto/buffer.cpp b/cpp/tests/buffer/buffer.cpp
similarity index 92%
rename from cpp/tests/raft_proto/buffer.cpp
rename to cpp/tests/buffer/buffer.cpp
index 6f6e940..7bf264b 100644
--- a/cpp/tests/raft_proto/buffer.cpp
+++ b/cpp/tests/buffer/buffer.cpp
@@ -3,16 +3,16 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/exceptions.hpp>
+#include <nvforest/buffer.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/cuda_check.hpp>
+#include <nvforest/detail/exceptions.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
-namespace raft_proto {
+namespace nvforest {
 
 TEST(Buffer, default_buffer)
 {
@@ -202,10 +202,10 @@ TEST(Buffer, copy_buffer)
     test_dev_buffers.emplace_back(orig_buffer, device_type::gpu, 0, cuda_stream{});
     for (auto& dev_buf : test_dev_buffers) {
       data_out = std::vector<int>(data.size());
-      cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
-                            static_cast<void*>(dev_buf.data()),
-                            dev_buf.size() * sizeof(int),
-                            cudaMemcpyDefault));
+      detail::cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
+                                    static_cast<void*>(dev_buf.data()),
+                                    dev_buf.size() * sizeof(int),
+                                    cudaMemcpyDefault));
       EXPECT_THAT(data_out, ::testing::ElementsAreArray(data));
 
       auto test_dev_copies = std::vector<buffer<int>>{};
@@ -214,10 +214,10 @@ TEST(Buffer, copy_buffer)
       test_dev_copies.emplace_back(dev_buf, device_type::gpu, 0, cuda_stream{});
       for (auto& copy_buf : test_dev_copies) {
         data_out = std::vector<int>(data.size());
-        cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
-                              static_cast<void*>(copy_buf.data()),
-                              copy_buf.size() * sizeof(int),
-                              cudaMemcpyDefault));
+        detail::cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
+                                      static_cast<void*>(copy_buf.data()),
+                                      copy_buf.size() * sizeof(int),
+                                      cudaMemcpyDefault));
         EXPECT_THAT(data_out, ::testing::ElementsAreArray(data));
       }
 
@@ -268,10 +268,10 @@ TEST(Buffer, move_buffer)
     ASSERT_NE(buf.data(), data.data());
 
     auto data_out = std::vector<int>(buf.size());
-    cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
-                          static_cast<void*>(buf.data()),
-                          buf.size() * sizeof(int),
-                          cudaMemcpyDefault));
+    detail::cuda_check(cudaMemcpy(static_cast<void*>(data_out.data()),
+                                  static_cast<void*>(buf.data()),
+                                  buf.size() * sizeof(int),
+                                  cudaMemcpyDefault));
     EXPECT_THAT(data_out, ::testing::ElementsAreArray(data));
   }
 #endif
@@ -306,7 +306,7 @@ TEST(Buffer, partial_buffer_copy)
   auto buf2 = buffer<int>{data2.data(), data2.size(), device_type::cpu};
   copy<true>(buf2, buf1, 1, 2, 3, cuda_stream{});
   copy<false>(buf2, buf1, 1, 2, 3, cuda_stream{});
-  EXPECT_THROW(copy<true>(buf2, buf1, 1, 2, 4, cuda_stream{}), out_of_bounds);
+  EXPECT_THROW(copy<true>(buf2, buf1, 1, 2, 4, cuda_stream{}), detail::out_of_bounds);
 }
 
 TEST(Buffer, buffer_copy_overloads)
@@ -360,4 +360,4 @@ TEST(Buffer, buffer_copy_overloads)
 #endif
 }
 
-}  // namespace raft_proto
+}  // namespace nvforest
diff --git a/cpp/tests/raft_proto/buffer.cu b/cpp/tests/buffer/buffer.cu
similarity index 79%
rename from cpp/tests/raft_proto/buffer.cu
rename to cpp/tests/buffer/buffer.cu
index 001b5f2..a114724 100644
--- a/cpp/tests/raft_proto/buffer.cu
+++ b/cpp/tests/buffer/buffer.cu
@@ -3,11 +3,11 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <nvforest/detail/raft_proto/buffer.hpp>
-#include <nvforest/detail/raft_proto/cuda_check.hpp>
-#include <nvforest/detail/raft_proto/cuda_stream.hpp>
-#include <nvforest/detail/raft_proto/device_type.hpp>
+#include <nvforest/buffer.hpp>
+#include <nvforest/cuda_stream.hpp>
+#include <nvforest/detail/cuda_check.hpp>
 #include <nvforest/detail/utils.hpp>
+#include <nvforest/device_type.hpp>
 
 #include <cuda_runtime_api.h>
 
@@ -16,7 +16,7 @@
 
 #include <iostream>
 
-namespace raft_proto {
+namespace nvforest {
 
 NVFOREST_KERNEL void check_buffer_access(int* buf)
 {
@@ -39,4 +39,4 @@ TEST(Buffer, device_buffer_access)
   EXPECT_THAT(data_out, testing::ElementsAreArray(expected));
 }
 
-}  // namespace raft_proto
+}  // namespace nvforest
diff --git a/cpp/tests/treelite_importer.cpp b/cpp/tests/treelite_importer.cpp
index 528aebb..5a67bf6 100644
--- a/cpp/tests/treelite_importer.cpp
+++ b/cpp/tests/treelite_importer.cpp
@@ -3,8 +3,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include <nvforest/detail/raft_proto/device_type.hpp>
-#include <nvforest/detail/raft_proto/handle.hpp>
+#include <nvforest/device_type.hpp>
+#include <nvforest/handle.hpp>
 #include <nvforest/postproc_ops.hpp>
 #include <nvforest/tree_layout.hpp>
 #include <nvforest/treelite_importer.hpp>
@@ -255,7 +255,7 @@ TEST(TreeliteImporter, depth_first)
   ASSERT_FALSE(nvforest_model.has_vector_leaves());
   ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
   ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu);
   ASSERT_EQ(nvforest_model.device_index(), -1);
   ASSERT_FALSE(nvforest_model.is_double_precision());
 }
@@ -269,7 +269,7 @@ TEST(TreeliteImporter, breadth_first)
   ASSERT_FALSE(nvforest_model.has_vector_leaves());
   ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
   ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu);
   ASSERT_EQ(nvforest_model.device_index(), -1);
   ASSERT_FALSE(nvforest_model.is_double_precision());
 }
@@ -284,7 +284,7 @@ TEST(TreeliteImporter, layered_children_together)
   ASSERT_FALSE(nvforest_model.has_vector_leaves());
   ASSERT_EQ(nvforest_model.row_postprocessing(), row_op::disable);
   ASSERT_EQ(nvforest_model.elem_postprocessing(), element_op::disable);
-  ASSERT_EQ(nvforest_model.memory_type(), raft_proto::device_type::cpu);
+  ASSERT_EQ(nvforest_model.memory_type(), nvforest::device_type::cpu);
   ASSERT_EQ(nvforest_model.device_index(), -1);
   ASSERT_FALSE(nvforest_model.is_double_precision());
 }
@@ -384,9 +384,9 @@ TEST(TreeliteImporter, DegenerateTree)
 
 #ifdef NVFOREST_ENABLE_GPU
   auto raft_handle = raft::handle_t{};
-  auto handle      = raft_proto::handle_t{raft_handle};
+  auto handle      = nvforest::handle_t{raft_handle};
 #else
-  auto handle = raft_proto::handle_t{};
+  auto handle = nvforest::handle_t{};
 #endif
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(1, 0.0);
@@ -395,8 +395,8 @@ TEST(TreeliteImporter, DegenerateTree)
                          preds.data(),
                          X.data(),
                          1,
-                         raft_proto::device_type::cpu,
-                         raft_proto::device_type::cpu,
+                         nvforest::device_type::cpu,
+                         nvforest::device_type::cpu,
                          nvforest::infer_kind::default_kind,
                          1);
   ASSERT_EQ(preds, expected_preds);
@@ -410,9 +410,9 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
 
 #ifdef NVFOREST_ENABLE_GPU
   auto raft_handle = raft::handle_t{};
-  auto handle      = raft_proto::handle_t{raft_handle};
+  auto handle      = nvforest::handle_t{raft_handle};
 #else
-  auto handle = raft_proto::handle_t{};
+  auto handle = nvforest::handle_t{};
 #endif
   auto X              = std::vector<double>{0.0};
   auto preds          = std::vector<double>(2, 0.0);
@@ -421,8 +421,8 @@ TEST(TreeliteImporter, DegenerateTreeWithVectorLeaf)
                          preds.data(),
                          X.data(),
                          1,
-                         raft_proto::device_type::cpu,
-                         raft_proto::device_type::cpu,
+                         nvforest::device_type::cpu,
+                         nvforest::device_type::cpu,
                          nvforest::infer_kind::default_kind,
                          1);
   ASSERT_EQ(preds, expected_preds);
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
index 8500861..2a3519f 100644
--- a/docs/source/getting_started.rst
+++ b/docs/source/getting_started.rst
@@ -151,7 +151,7 @@ nvForest provides a CMake config file so that other C++ projects can find and us
 
     find_package(nvforest CONFIG REQUIRED)
 
-    target_link_libraries(my_target PRIVATE nvforest::nvforest++ treelite::treelite)
+    target_link_libraries(my_target PRIVATE nvforest::nvforest treelite::treelite)
 
 To ensure that CMake can locate nvForest and Treelite, we recommend the use of Conda to install nvForest.
 
@@ -179,9 +179,9 @@ Once the tree model is available as a Treelite object, pass it to the
 .. code-block:: cpp
 
     #include <nvforest/constants.hpp>
+    #include <nvforest/device_type.hpp>
     #include <nvforest/treelite_importer.hpp>
     #include <nvforest/detail/index_type.hpp>
-    #include <nvforest/detail/raft_proto/device_type.hpp>
     #include <optional>
 
     auto fm = nvforest::import_from_treelite_model(
@@ -189,22 +189,22 @@ Once the tree model is available as a Treelite object, pass it to the
         nvforest::preferred_tree_layout,
         nvforest::index_type{},
         std::nullopt,
-        raft_proto::device_type::gpu);
+        nvforest::device_type::gpu);
 
 Now that the tree model is fully imported into nvForest, let's run inference:
 
 .. code-block:: cpp
 
     #include <raft/core/handle.hpp>
-    #include <nvforest/detail/raft_proto/handle.hpp>
+    #include <nvforest/handle.hpp>
 
     raft::handle_t raft_handle{};
-    raft_proto::handle_t handle{raft_handle};
+    nvforest::handle_t handle{raft_handle};
 
     // Assumption:
     // * Both output and input are in the GPU memory.
     // * The input buffer should be of dimension (num_rows, num_features)
     // * The output buffer should be of dimension (num_rows, fm.num_outputs())
     fm.predict(handle, output, input, num_rows,
-               raft_proto::device_type::gpu, raft_proto::device_type::gpu,
+               nvforest::device_type::gpu, nvforest::device_type::gpu,
                nvforest::infer_kind::default_kind);
diff --git a/python/libnvforest/CMakeLists.txt b/python/libnvforest/CMakeLists.txt
index 37eb6c1..e360602 100644
--- a/python/libnvforest/CMakeLists.txt
+++ b/python/libnvforest/CMakeLists.txt
@@ -30,7 +30,7 @@ unset(nvforest_FOUND)
 # --- nvForest --- #
 set(BUILD_NVFOREST_TESTS OFF)
 
-set(NVFOREST_CPP_TARGET "nvforest++")
+set(NVFOREST_CPP_TARGET "nvforest")
 set(NVFOREST_CPP_SRC "../../cpp")
 
 # --- raft --- #
diff --git a/python/libnvforest/libnvforest/load.py b/python/libnvforest/libnvforest/load.py
index c8fe52e..08f4a77 100644
--- a/python/libnvforest/libnvforest/load.py
+++ b/python/libnvforest/libnvforest/load.py
@@ -34,7 +34,7 @@ def _load_wheel_installation(soname: str):
 
 
 def load_library():
-    """Dynamically load libnvforest++.so and its dependencies"""
+    """Dynamically load libnvforest.so and its dependencies"""
     try:
         # These libraries must all be loaded before libnvforest
         import libraft
@@ -55,7 +55,7 @@ def load_library():
         != "false"
     )
 
-    soname = "libnvforest++.so"
+    soname = "libnvforest.so"
     libnvforest_lib = None
     if prefer_system_installation:
         # Prefer a system library if one is present to
diff --git a/python/nvforest/CMakeLists.txt b/python/nvforest/CMakeLists.txt
index 24333a1..90dac88 100644
--- a/python/nvforest/CMakeLists.txt
+++ b/python/nvforest/CMakeLists.txt
@@ -24,7 +24,7 @@ project(
 # * User Options  --------------------------------------------------------------
 option(USE_LIBNVFOREST_WHEEL "Use libnvforest wheel to provide some dependencies" OFF)
 
-set(NVFOREST_CPP_TARGET "nvforest++")
+set(NVFOREST_CPP_TARGET "nvforest")
 set(NVFOREST_CPP_SRC "../../cpp")
 
 # ######################################################################################################################
diff --git a/python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd b/python/nvforest/nvforest/detail/cuda_stream.pxd
similarity index 63%
rename from python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd
rename to python/nvforest/nvforest/detail/cuda_stream.pxd
index 899fed4..d12bf05 100644
--- a/python/nvforest/nvforest/detail/raft_proto/cuda_stream.pxd
+++ b/python/nvforest/nvforest/detail/cuda_stream.pxd
@@ -2,6 +2,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 #
-cdef extern from "nvforest/detail/raft_proto/cuda_stream.hpp" namespace "raft_proto" nogil:
+cdef extern from "nvforest/cuda_stream.hpp" namespace "nvforest" nogil:
     cdef cppclass cuda_stream:
         pass
diff --git a/python/nvforest/nvforest/detail/device_type.pxd b/python/nvforest/nvforest/detail/device_type.pxd
new file mode 100644
index 0000000..ea89f3a
--- /dev/null
+++ b/python/nvforest/nvforest/detail/device_type.pxd
@@ -0,0 +1,8 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0
+#
+cdef extern from "nvforest/device_type.hpp" namespace "nvforest" nogil:
+    cdef enum device_type:
+        cpu "nvforest::device_type::cpu",
+        gpu "nvforest::device_type::gpu"
diff --git a/python/nvforest/nvforest/detail/forest_inference.pyx b/python/nvforest/nvforest/detail/forest_inference.pyx
index 0340c59..ce15c5b 100644
--- a/python/nvforest/nvforest/detail/forest_inference.pyx
+++ b/python/nvforest/nvforest/detail/forest_inference.pyx
@@ -14,18 +14,14 @@ from nvforest.detail.treelite import safe_treelite_call
 
 from libc.stdint cimport uint32_t, uintptr_t
 from libcpp cimport bool
+from libcpp.optional cimport nullopt, optional
 from pylibraft.common.handle cimport handle_t as raft_handle_t
 
+from nvforest.detail.cuda_stream cimport cuda_stream as nvforest_stream_t
+from nvforest.detail.device_type cimport device_type as nvforest_device_t
+from nvforest.detail.handle cimport handle_t as nvforest_handle_t
 from nvforest.detail.infer_kind cimport infer_kind
 from nvforest.detail.postprocessing cimport element_op, row_op
-from nvforest.detail.raft_proto.cuda_stream cimport (
-    cuda_stream as raft_proto_stream_t,
-)
-from nvforest.detail.raft_proto.device_type cimport (
-    device_type as raft_proto_device_t,
-)
-from nvforest.detail.raft_proto.handle cimport handle_t as raft_proto_handle_t
-from nvforest.detail.raft_proto.optional cimport nullopt, optional
 from nvforest.detail.tree_layout cimport tree_layout as nvforest_tree_layout
 from nvforest.detail.treelite cimport (
     TreeliteDeserializeModelFromBytes,
@@ -37,12 +33,12 @@ from nvforest.detail.treelite cimport (
 cdef extern from "nvforest/forest_model.hpp" namespace "nvforest" nogil:
     cdef cppclass forest_model:
         void predict[io_t](
-            const raft_proto_handle_t&,
+            const nvforest_handle_t&,
             io_t*,
             io_t*,
             size_t,
-            raft_proto_device_t,
-            raft_proto_device_t,
+            nvforest_device_t,
+            nvforest_device_t,
             infer_kind,
             optional[uint32_t]
         ) except +
@@ -61,15 +57,15 @@ cdef extern from "nvforest/treelite_importer.hpp" namespace "nvforest" nogil:
         nvforest_tree_layout,
         uint32_t,
         optional[bool],
-        raft_proto_device_t,
+        nvforest_device_t,
         int,
-        raft_proto_stream_t
+        nvforest_stream_t
     ) except +
 
 
 cdef class ForestInference_impl():
     cdef forest_model model
-    cdef raft_proto_handle_t raft_proto_handle
+    cdef nvforest_handle_t nvforest_handle
     cdef object raft_handle
     cdef object device
 
@@ -84,10 +80,10 @@ cdef class ForestInference_impl():
         device: str = "cpu",
         device_id: Optional[int] = None,
     ):
-        # Store reference to RAFT handle to control lifetime, since raft_proto
-        # handle keeps a pointer to it
+        # Store reference to RAFT handle to control lifetime, since
+        # nvforest_handle keeps a pointer to it
         self.raft_handle = raft_handle
-        self.raft_proto_handle = raft_proto_handle_t(
+        self.nvforest_handle = nvforest_handle_t(
             <raft_handle_t*><size_t>self.raft_handle.getHandle()
         )
 
@@ -106,8 +102,8 @@ cdef class ForestInference_impl():
             "Failed to load Treelite model from bytes:"
         )
 
-        cdef raft_proto_device_t dev_type
-        dev_type = raft_proto_device_t.gpu if device == "gpu" else raft_proto_device_t.cpu
+        cdef nvforest_device_t dev_type
+        dev_type = nvforest_device_t.gpu if device == "gpu" else nvforest_device_t.cpu
         cdef nvforest_tree_layout tree_layout
         if layout.lower() == "depth_first":
             tree_layout = nvforest_tree_layout.depth_first
@@ -134,7 +130,7 @@ cdef class ForestInference_impl():
             use_double_precision_c,
             dev_type,
             device_id,
-            self.raft_proto_handle.get_next_usable_stream()
+            self.nvforest_handle.get_next_usable_stream()
         )
 
         safe_treelite_call(
@@ -188,9 +184,9 @@ cdef class ForestInference_impl():
         chunk_size: Optional[int] = None,
     ) -> DataType:
         cdef uintptr_t in_ptr
-        cdef raft_proto_device_t in_dev
+        cdef nvforest_device_t in_dev
         cdef uintptr_t out_ptr
-        cdef raft_proto_device_t out_dev
+        cdef nvforest_device_t out_dev
         cdef infer_kind infer_type_enum
         cdef optional[uint32_t] chunk_specification
 
@@ -220,9 +216,9 @@ cdef class ForestInference_impl():
                 order="C",
             )
             in_ptr = X.__array_interface__["data"][0]
-            in_dev = raft_proto_device_t.cpu
+            in_dev = nvforest_device_t.cpu
             out_ptr = preds.__array_interface__["data"][0]
-            out_dev = raft_proto_device_t.cpu
+            out_dev = nvforest_device_t.cpu
         else:
             assert self.device == "gpu"
             import cupy as cp
@@ -233,9 +229,9 @@ cdef class ForestInference_impl():
                 order="C",
             )
             in_ptr = X.__cuda_array_interface__["data"][0]
-            in_dev = raft_proto_device_t.gpu
+            in_dev = nvforest_device_t.gpu
             out_ptr = preds.__cuda_array_interface__["data"][0]
-            out_dev = raft_proto_device_t.gpu
+            out_dev = nvforest_device_t.gpu
 
         if chunk_size is None:
             chunk_specification = nullopt
@@ -244,7 +240,7 @@ cdef class ForestInference_impl():
 
         if model_dtype == np.float32:
             self.model.predict[float](
-                self.raft_proto_handle,
+                self.nvforest_handle,
                 <float *> out_ptr,
                 <float *> in_ptr,
                 n_rows,
@@ -255,7 +251,7 @@ cdef class ForestInference_impl():
             )
         else:
             self.model.predict[double](
-                self.raft_proto_handle,
+                self.nvforest_handle,
                 <double *> out_ptr,
                 <double *> in_ptr,
                 n_rows,
@@ -266,7 +262,7 @@ cdef class ForestInference_impl():
             )
 
         if self.device == "gpu":
-            self.raft_proto_handle.synchronize()
+            self.nvforest_handle.synchronize()
         return preds
 
 
diff --git a/python/nvforest/nvforest/detail/raft_proto/handle.pxd b/python/nvforest/nvforest/detail/handle.pxd
similarity index 61%
rename from python/nvforest/nvforest/detail/raft_proto/handle.pxd
rename to python/nvforest/nvforest/detail/handle.pxd
index 5ad107e..41c16a8 100644
--- a/python/nvforest/nvforest/detail/raft_proto/handle.pxd
+++ b/python/nvforest/nvforest/detail/handle.pxd
@@ -5,15 +5,13 @@
 
 from pylibraft.common.handle cimport handle_t as raft_handle_t
 
-from nvforest.detail.raft_proto.cuda_stream cimport (
-    cuda_stream as raft_proto_stream_t,
-)
+from nvforest.detail.cuda_stream cimport cuda_stream as nvforest_stream_t
 
 
-cdef extern from "nvforest/detail/raft_proto/handle.hpp" namespace "raft_proto" nogil:
+cdef extern from "nvforest/handle.hpp" namespace "nvforest" nogil:
     cdef cppclass handle_t:
         handle_t() except +
         handle_t(const raft_handle_t* handle_ptr) except +
         handle_t(const raft_handle_t& handle) except +
-        raft_proto_stream_t get_next_usable_stream() except +
+        nvforest_stream_t get_next_usable_stream() except +
         void synchronize() except+
diff --git a/python/nvforest/nvforest/detail/raft_proto/__init__.py b/python/nvforest/nvforest/detail/raft_proto/__init__.py
deleted file mode 100644
index b25a9d5..0000000
--- a/python/nvforest/nvforest/detail/raft_proto/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
diff --git a/python/nvforest/nvforest/detail/raft_proto/device_type.pxd b/python/nvforest/nvforest/detail/raft_proto/device_type.pxd
deleted file mode 100644
index dc6214c..0000000
--- a/python/nvforest/nvforest/detail/raft_proto/device_type.pxd
+++ /dev/null
@@ -1,8 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-cdef extern from "nvforest/detail/raft_proto/device_type.hpp" namespace "raft_proto" nogil:
-    cdef enum device_type:
-        cpu "raft_proto::device_type::cpu",
-        gpu "raft_proto::device_type::gpu"
diff --git a/python/nvforest/nvforest/detail/raft_proto/optional.pxd b/python/nvforest/nvforest/detail/raft_proto/optional.pxd
deleted file mode 100644
index 54dcac4..0000000
--- a/python/nvforest/nvforest/detail/raft_proto/optional.pxd
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
-# SPDX-License-Identifier: Apache-2.0
-#
-# The following is taken from
-# https://github.com/cython/cython/blob/master/Cython/Includes/libcpp/optional.pxd,
-# which provides a binding for std::optional in Cython 3.0
-
-from libcpp cimport bool
-
-
-cdef extern from "<optional>" namespace "std" nogil:
-    cdef cppclass nullopt_t:
-        nullopt_t()
-
-    cdef nullopt_t nullopt
-
-    cdef cppclass optional[T]:
-        ctypedef T value_type
-        optional()
-        optional(nullopt_t)
-        optional(optional&) except +
-        optional(T&) except +
-        bool has_value()
-        T& value()
-        T& value_or[U](U& default_value)
-        void swap(optional&)
-        void reset()
-        T& emplace(...)
-        T& operator*()
-        # T* operator->() # Not Supported
-        optional& operator=(optional&)
-        optional& operator=[U](U&)
-        bool operator bool()
-        bool operator!()
-        bool operator==[U](optional&, U&)
-        bool operator!=[U](optional&, U&)
-        bool operator<[U](optional&, U&)
-        bool operator>[U](optional&, U&)
-        bool operator<=[U](optional&, U&)
-        bool operator>=[U](optional&, U&)
-
-    optional[T] make_optional[T](...) except +