Skip to content

Commit

Permalink
Deprecate cub::RegBoundScaling and cub::MemBoundScaling (#3685)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Schellenberger Costa <[email protected]>
  • Loading branch information
fbusato and miscco authored Feb 6, 2025
1 parent dd586a2 commit 0dcbde0
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 47 deletions.
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ struct policy_hub_t
LOAD_MODIFIER,
STORE_ALGORITHM,
SCAN_ALGORITHM,
cub::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
cub::detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
delay_constructor_t>;

struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
Expand Down
19 changes: 10 additions & 9 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,16 @@ CUB_NAMESPACE_BEGIN
* @tparam _RADIX_BITS
* The number of radix bits, i.e., log2(bins)
*/
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
RadixRankAlgorithm _RANK_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
int _RADIX_BITS,
typename ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
RadixRankAlgorithm _RANK_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
int _RADIX_BITS,
typename ScalingType = detail::RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentRadixSortDownsweepPolicy : ScalingType
{
enum
Expand Down
27 changes: 14 additions & 13 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,20 @@ enum RadixSortStoreAlgorithm
RADIX_SORT_STORE_ALIGNED
};

template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
support warp-strided key arrangement and count callbacks are supported. */
RadixRankAlgorithm _RANK_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
RadixSortStoreAlgorithm _STORE_ALGORITHM,
int _RADIX_BITS,
typename ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
/** \brief Number of private histograms to use in the ranker;
ignored if the ranking algorithm is not one of RADIX_RANK_MATCH_EARLY_COUNTS_* */
int _RANK_NUM_PARTS,
/** \brief Ranking algorithm used in the onesweep kernel. Only algorithms that
support warp-strided key arrangement and count callbacks are supported. */
RadixRankAlgorithm _RANK_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
RadixSortStoreAlgorithm _STORE_ALGORITHM,
int _RADIX_BITS,
typename ScalingType = detail::RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentRadixSortOnesweepPolicy : ScalingType
{
enum
Expand Down
13 changes: 7 additions & 6 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,13 @@ CUB_NAMESPACE_BEGIN
* @tparam _RADIX_BITS
* The number of radix bits, i.e., log2(bins)
*/
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
CacheLoadModifier _LOAD_MODIFIER,
int _RADIX_BITS,
typename ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
CacheLoadModifier _LOAD_MODIFIER,
int _RADIX_BITS,
typename ScalingType = detail::RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentRadixSortUpsweepPolicy : ScalingType
{
enum
Expand Down
15 changes: 8 additions & 7 deletions cub/cub/agent/agent_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,14 @@ CUB_NAMESPACE_BEGIN
* @tparam _BLOCK_ALGORITHM Cooperative block-wide reduction algorithm to use
* @tparam _LOAD_MODIFIER Cache load modifier for reading input elements
*/
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
int _VECTOR_LOAD_LENGTH,
BlockReduceAlgorithm _BLOCK_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
typename ScalingType = MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
int _VECTOR_LOAD_LENGTH,
BlockReduceAlgorithm _BLOCK_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
typename ScalingType = detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentReducePolicy : ScalingType
{
/// Number of items per vectorized load
Expand Down
19 changes: 10 additions & 9 deletions cub/cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,16 @@ CUB_NAMESPACE_BEGIN
* Implementation detail, do not specify directly, requirements on the
* content of this type are subject to breaking change.
*/
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
BlockStoreAlgorithm _STORE_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
typename ScalingType = MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
typename DelayConstructorT = detail::default_delay_constructor_t<ComputeT>>
template <
int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
BlockStoreAlgorithm _STORE_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
typename ScalingType = detail::MemBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>,
typename DelayConstructorT = detail::default_delay_constructor_t<ComputeT>>
struct AgentScanPolicy : ScalingType
{
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/tuning/tuning_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ struct policy_hub
LOAD_DEFAULT,
Tuning::store_algorithm,
BLOCK_SCAN_WARP_SCANS,
MemBoundScaling<Tuning::threads, Tuning::items, AccumT>,
cub::detail::MemBoundScaling<Tuning::threads, Tuning::items, AccumT>,
typename Tuning::delay_constructor>;
template <typename Tuning>
static auto select_agent_policy(long) -> typename DefaultPolicy::ScanPolicyT;
Expand Down
11 changes: 10 additions & 1 deletion cub/cub/util_arch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ namespace detail
// The maximum amount of static shared memory available per thread block
// Note that in contrast to dynamic shared memory, static shared memory is still limited to 48 KB
static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024;
} // namespace detail

template <int Nominal4ByteBlockThreads, int Nominal4ByteItemsPerThread, typename T>
struct RegBoundScaling
Expand All @@ -137,6 +136,16 @@ struct MemBoundScaling
::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32);
};

} // namespace detail

// Deprecated public alias that forwards to detail::RegBoundScaling with the same template arguments.
// Code that still names RegBoundScaling outside of detail keeps compiling, but receives a deprecation diagnostic.
template <int Nominal4ByteBlockThreads, int Nominal4ByteItemsPerThread, typename T>
using RegBoundScaling CCCL_DEPRECATED_BECAUSE("Internal implementation detail") =
detail::RegBoundScaling<Nominal4ByteBlockThreads, Nominal4ByteItemsPerThread, T>;

// Deprecated public alias that forwards to detail::MemBoundScaling with the same template arguments.
// Code that still names MemBoundScaling outside of detail keeps compiling, but receives a deprecation diagnostic.
// NOTE: this must forward to the Mem-bound policy (not detail::RegBoundScaling), otherwise legacy users
// silently get a different ITEMS_PER_THREAD / BLOCK_THREADS scaling than they asked for.
template <int Nominal4ByteBlockThreads, int Nominal4ByteItemsPerThread, typename T>
using MemBoundScaling CCCL_DEPRECATED_BECAUSE("Internal implementation detail") =
  detail::MemBoundScaling<Nominal4ByteBlockThreads, Nominal4ByteItemsPerThread, T>;

#endif // Do not document

CUB_NAMESPACE_END

0 comments on commit 0dcbde0

Please sign in to comment.