From 45bb32f206fb4c9c09854e0a9fca296e4bb482e3 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Thu, 21 Nov 2024 07:45:32 -0800 Subject: [PATCH 1/9] [SYCL] Fixed-size groups and partitions are renamed to "chunks" --- sycl/include/sycl/detail/spirv.hpp | 44 +++++++-------- .../oneapi/experimental/fixed_size_group.hpp | 56 +++++++++---------- .../experimental/non_uniform_groups.hpp | 2 +- .../{fixed_size_group.cpp => chunk.cpp} | 22 ++++---- ...up_algorithms.cpp => chunk_algorithms.cpp} | 31 +++++----- sycl/test/regression/group_algorithms.cpp | 2 +- 6 files changed, 80 insertions(+), 77 deletions(-) rename sycl/test-e2e/NonUniformGroups/{fixed_size_group.cpp => chunk.cpp} (72%) rename sycl/test-e2e/NonUniformGroups/{fixed_size_group_algorithms.cpp => chunk_algorithms.cpp} (88%) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index ad59d129622b9..fc9939cc63e5e 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -26,7 +26,7 @@ namespace oneapi { struct sub_group; namespace experimental { template class ballot_group; -template class fixed_size_group; +template class chunk; template class root_group; template class tangle_group; class opportunistic_group; @@ -78,9 +78,9 @@ struct is_ballot_group< template struct is_fixed_size_group : std::false_type {}; -template -struct is_fixed_size_group> : std::true_type {}; +template +struct is_chunk> : std::true_type {}; template struct group_scope {}; @@ -105,9 +105,9 @@ struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; -template -struct group_scope> { +template +struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; @@ -174,15 +174,15 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAll(group_scope::value, pred); } } -template +template bool GroupAll( - ext::oneapi::experimental::fixed_size_group, + ext::oneapi::experimental::chunk, bool pred) { // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), - static_cast(pred), PartitionSize); + static_cast(pred), ChunkSize); } template bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { @@ -210,15 +210,15 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAny(group_scope::value, pred); } } -template +template bool GroupAny( - ext::oneapi::experimental::fixed_size_group, + ext::oneapi::experimental::chunk, bool pred) { // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), - static_cast(pred), PartitionSize); + static_cast(pred), ChunkSize); } template bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { @@ -327,12 +327,12 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, WideOCLX, OCLId); } } -template +template EnableIfNativeBroadcast GroupBroadcast( - ext::oneapi::experimental::fixed_size_group g, + ext::oneapi::experimental::chunk g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup - auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; + auto LocalId = g.get_group_linear_id() * ChunkSize + local_id; // TODO: Refactor to avoid duplication after design settles. auto GroupLocalId = static_cast::type>(LocalId); @@ -341,9 +341,9 @@ EnableIfNativeBroadcast GroupBroadcast( auto OCLId = detail::convertToOpenCLType(GroupLocalId); // NonUniformBroadcast requires Id to be dynamically uniform, which does not - // hold here; each partition is broadcasting a separate index. We could + // hold here; each chunk is broadcasting a separate index. We could // fallback to either NonUniformShuffle or a NonUniformBroadcast per - // partition, and it's unclear which will be faster in practice. + // chunk, and it's unclear which will be faster in practice. return __spirv_GroupNonUniformShuffle(group_scope::value, WideOCLX, OCLId); } @@ -1298,12 +1298,10 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } \ } \ \ - template <__spv::GroupOperation Op, size_t PartitionSize, \ + template <__spv::GroupOperation Op, size_t ChunkSize, \ typename ParentGroup, typename T> \ inline T Group##Instruction( \ - ext::oneapi::experimental::fixed_size_group \ - g, \ - T x) { \ + ext::oneapi::experimental::chunk g, T x) \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ @@ -1321,7 +1319,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { constexpr auto OpInt = \ static_cast(__spv::GroupOperation::ClusteredReduce); \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \ - PartitionSize); \ + ChunkSize); \ } else { \ T tmp; \ for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \ diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index 56f30f2091a6b..8a8bda85bc243 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -26,18 +26,18 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -template class fixed_size_group; +template class chunk; -template +template #ifdef __SYCL_DEVICE_ONLY__ [[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fixed_size_group)]] #endif inline std::enable_if_t> && std::is_same_v, - fixed_size_group> -get_fixed_size_group(Group group); + chunk> +chunked_partition(Group group); -template class fixed_size_group { +template class chunk { public: using id_type = id<1>; using range_type = range<1>; @@ -47,7 +47,7 @@ template class fixed_size_group { id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupLocalInvocationId() / PartitionSize; + return __spirv_SubgroupLocalInvocationId() / ChunkSize; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -56,7 +56,7 @@ template class fixed_size_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupLocalInvocationId() % PartitionSize; + return __spirv_SubgroupLocalInvocationId() % ChunkSize; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -65,7 +65,7 @@ template class fixed_size_group { range_type get_group_range() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupSize() / PartitionSize; + return __spirv_SubgroupSize() / ChunkSize; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -74,7 +74,7 @@ template class fixed_size_group { range_type get_local_range() const { #ifdef __SYCL_DEVICE_ONLY__ - return PartitionSize; + return ChunkSize; #else throw exception(make_error_code(errc::runtime), "Non-uniform groups are not supported on host."); @@ -137,34 +137,34 @@ template class fixed_size_group { fixed_size_group() {} #endif - friend fixed_size_group - get_fixed_size_group(ParentGroup g); + friend chunk + chunked_partition(ParentGroup g); friend sub_group_mask - sycl::detail::GetMask>( - fixed_size_group Group); + sycl::detail::GetMask>( + chunk Group); }; -template +template inline std::enable_if_t> && std::is_same_v, - fixed_size_group> -get_fixed_size_group(Group group) { + chunk> +chunked_partition(Group group) { (void)group; #ifdef __SYCL_DEVICE_ONLY__ #if defined(__NVPTX__) uint32_t loc_id = group.get_local_linear_id(); uint32_t loc_size = group.get_local_linear_range(); - uint32_t bits = PartitionSize == 32 + uint32_t bits = ChunkSize == 32 ? 0xffffffff - : ((1 << PartitionSize) - 1) - << ((loc_id / PartitionSize) * PartitionSize); + : ((1 << ChunkSize) - 1) + << ((loc_id / ChunkSize) * ChunkSize); - return fixed_size_group( + return chunk( sycl::detail::Builder::createSubGroupMask( bits, loc_size)); #else - return fixed_size_group(); + return chunk(); #endif #else throw exception(make_error_code(errc::runtime), @@ -172,22 +172,22 @@ get_fixed_size_group(Group group) { #endif } -template -struct is_user_constructed_group> +template +struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental namespace detail { -template -struct is_fixed_size_group< - ext::oneapi::experimental::fixed_size_group> +template +struct is_chunk< + ext::oneapi::experimental::chunk> : std::true_type {}; } // namespace detail -template +template struct is_group< - ext::oneapi::experimental::fixed_size_group> + ext::oneapi::experimental::chunk> : std::true_type {}; } // namespace _V1 diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index bbe619834dcdc..4797eb75eff9c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -73,7 +73,7 @@ namespace ext::oneapi::experimental { // Forward declarations of non-uniform group types for algorithm definitions template class ballot_group; -template class fixed_size_group; +template class chunk; template class tangle_group; class opportunistic_group; diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp similarity index 72% rename from sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp rename to sycl/test-e2e/NonUniformGroups/chunk.cpp index 939be57799dd4..36601f9046189 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -13,19 +13,19 @@ #include namespace syclex = sycl::ext::oneapi::experimental; -template class TestKernel; +template class TestKernel; -template void test() { +template void test() { sycl::queue Q; // Test for both the full sub-group size and a case with less work than a full // sub-group. for (size_t WGS : std::array{32, 16}) { - if (WGS < PartitionSize) + if (WGS < ChunkSize) continue; std::cout << "Testing for work size " << WGS << " and partition size " - << PartitionSize << std::endl; + << ChunkSize << std::endl; sycl::buffer MatchBuf{sycl::range{WGS}}; sycl::buffer LeaderBuf{sycl::range{WGS}}; @@ -40,24 +40,24 @@ template void test() { auto SG = item.get_sub_group(); auto SGS = SG.get_local_linear_range(); - auto Partition = syclex::get_fixed_size_group(SG); + auto Partition = syclex::chunked_partition(SG); bool Match = true; - Match &= (Partition.get_group_id() == (WI / PartitionSize)); - Match &= (Partition.get_local_id() == (WI % PartitionSize)); - Match &= (Partition.get_group_range() == (SGS / PartitionSize)); - Match &= (Partition.get_local_range() == PartitionSize); + Match &= (Partition.get_group_id() == (WI / ChunkSize)); + Match &= (Partition.get_local_id() == (WI % ChunkSize)); + Match &= (Partition.get_group_range() == (SGS / ChunkSize)); + Match &= (Partition.get_local_range() == ChunkSize); MatchAcc[WI] = Match; LeaderAcc[WI] = Partition.leader(); }; - CGH.parallel_for>(NDR, KernelFunc); + CGH.parallel_for>(NDR, KernelFunc); }); sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; for (int WI = 0; WI < WGS; ++WI) { assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + assert(LeaderAcc[WI] == ((WI % ChunkSize) == 0)); } } } diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp similarity index 88% rename from sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index c1c172c4189c3..458ee88368fc0 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -15,9 +15,14 @@ #include namespace syclex = sycl::ext::oneapi::experimental; -template class TestKernel; -template void test() { + + + + +template class TestKernel; + +template void test() { sycl::queue Q; constexpr uint32_t SGSize = 32; @@ -57,25 +62,25 @@ template void test() { auto SG = item.get_sub_group(); // Split into partitions of fixed size - auto Partition = syclex::get_fixed_size_group(SG); + auto Partition = syclex::chunked_partition(SG); // Check all other members' writes are visible after a barrier. TmpAcc[WI] = 1; sycl::group_barrier(Partition); size_t Visible = 0; for (size_t Other = 0; Other < SGSize; ++Other) { - if ((WI / PartitionSize) == (Other / PartitionSize)) { + if ((WI / ChunkSize) == (Other / ChunkSize)) { Visible += TmpAcc[Other]; } } - BarrierAcc[WI] = (Visible == PartitionSize); + BarrierAcc[WI] = (Visible == ChunkSize); // Simple check of group algorithms. uint32_t OriginalLID = SG.get_local_linear_id(); uint32_t LID = Partition.get_local_linear_id(); uint32_t PartitionLeader = - (OriginalLID / PartitionSize) * PartitionSize; + (OriginalLID / ChunkSize) * ChunkSize; uint32_t BroadcastResult = sycl::group_broadcast(Partition, OriginalLID, 0); BroadcastAcc[WI] = (BroadcastResult == PartitionLeader); @@ -83,7 +88,7 @@ template void test() { bool AnyResult = sycl::any_of_group(Partition, (LID == 0)); AnyAcc[WI] = (AnyResult == true); - bool Predicate = ((OriginalLID / PartitionSize) % 2 == 0); + bool Predicate = ((OriginalLID / ChunkSize) % 2 == 0); bool AllResult = sycl::all_of_group(Partition, Predicate); if (Predicate) { AllAcc[WI] = (AllResult == true); @@ -100,7 +105,7 @@ template void test() { uint32_t ReduceResult = sycl::reduce_over_group(Partition, 1, sycl::plus<>()); - ReduceAcc[WI] = (ReduceResult == PartitionSize); + ReduceAcc[WI] = (ReduceResult == ChunkSize); uint32_t ExScanResult = sycl::exclusive_scan_over_group(Partition, 1, sycl::plus<>()); @@ -112,7 +117,7 @@ template void test() { uint32_t ShiftLeftResult = sycl::shift_group_left(Partition, LID, 2); ShiftLeftAcc[WI] = - (LID + 2 >= PartitionSize || ShiftLeftResult == LID + 2); + (LID + 2 >= ChunkSize || ShiftLeftResult == LID + 2); uint32_t ShiftRightResult = sycl::shift_group_right(Partition, LID, 2); @@ -120,16 +125,16 @@ template void test() { uint32_t SelectResult = sycl::select_from_group( Partition, OriginalLID, - (Partition.get_local_id() + 2) % PartitionSize); + (Partition.get_local_id() + 2) % ChunkSize); SelectAcc[WI] = - SelectResult == OriginalLID - LID + ((LID + 2) % PartitionSize); + SelectResult == OriginalLID - LID + ((LID + 2) % ChunkSize); - uint32_t Mask = PartitionSize <= 2 ? 0 : 2; + uint32_t Mask = ChunkSize <= 2 ? 0 : 2; uint32_t PermuteXorResult = sycl::permute_group_by_xor(Partition, LID, Mask); PermuteXorAcc[WI] = (PermuteXorResult == (LID ^ Mask)); }; - CGH.parallel_for>(NDR, KernelFunc); + CGH.parallel_for>(NDR, KernelFunc); }); sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only}; diff --git a/sycl/test/regression/group_algorithms.cpp b/sycl/test/regression/group_algorithms.cpp index 0d9cecc0192db..8e8af2b2fa90b 100644 --- a/sycl/test/regression/group_algorithms.cpp +++ b/sycl/test/regression/group_algorithms.cpp @@ -154,7 +154,7 @@ int main() { TestForGroup(SG); TestForGroup(NDI.get_group()); TestForGroup(syclex::get_ballot_group(SG, true)); - TestForGroup(syclex::get_fixed_size_group<8>(SG)); + TestForGroup(syclex::chunked_partition<8>(SG)); TestForGroup(syclex::get_tangle_group(SG)); TestForGroup(syclex::this_kernel::get_opportunistic_group()); }); From fb1d33d5110f53fb91e863058a75a6487de6767f Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Sun, 24 Nov 2024 01:06:07 -0800 Subject: [PATCH 2/9] SYCL] Fixed-size groups and partitions are renamed to "chunks" --- llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td | 8 ++++---- sycl/include/sycl/detail/spirv.hpp | 12 ++++++------ sycl/include/sycl/detail/type_traits.hpp | 4 ++-- .../{fixed_size_group.hpp => chunk.hpp} | 14 ++++++++------ .../experimental/cuda/non_uniform_algorithms.hpp | 12 ++++++------ sycl/include/sycl/info/aspects.def | 2 +- sycl/include/sycl/sycl.hpp | 2 +- sycl/source/detail/device_impl.cpp | 2 +- sycl/test-e2e/NonUniformGroups/chunk.cpp | 12 ++++++++++-- .../test-e2e/NonUniformGroups/chunk_algorithms.cpp | 9 +++------ .../NonUniformGroups/is_user_constructed.cpp | 6 +++--- 11 files changed, 45 insertions(+), 38 deletions(-) rename sycl/include/sycl/ext/oneapi/experimental/{fixed_size_group.hpp => chunk.hpp} (93%) diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 5fc1cf79a1caa..d0ce4ed8eb7f1 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -71,7 +71,7 @@ def AspectExt_oneapi_bindless_images_sample_1d_usm : Aspect<"ext_oneapi_bindless def AspectExt_oneapi_bindless_images_sample_2d_usm : Aspect<"ext_oneapi_bindless_images_sample_2d_usm">; def AspectExt_intel_esimd : Aspect<"ext_intel_esimd">; def AspectExt_oneapi_ballot_group : Aspect<"ext_oneapi_ballot_group">; -def AspectExt_oneapi_fixed_size_group : Aspect<"ext_oneapi_fixed_size_group">; +def AspectExt_oneapi_chunk : Aspect<"ext_oneapi_chunk">; def AspectExt_oneapi_opportunistic_group : Aspect<"ext_oneapi_opportunistic_group">; def AspectExt_oneapi_tangle_group : Aspect<"ext_oneapi_tangle_group">; def AspectExt_intel_matrix : Aspect<"ext_intel_matrix">; @@ -144,7 +144,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_bindless_sampled_image_fetch_2d_usm, AspectExt_oneapi_bindless_sampled_image_fetch_2d, AspectExt_oneapi_bindless_sampled_image_fetch_3d, AspectExt_intel_esimd, - AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, + AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, @@ -163,7 +163,7 @@ defvar IntelCpuAspects = [ AspectCpu, AspectFp16, AspectFp64, AspectQueue_profiling, AspectAtomic64, AspectExt_oneapi_srgb, AspectExt_oneapi_native_assert, AspectExt_intel_legacy_image, AspectExt_oneapi_ballot_group, - AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, + AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_oneapi_private_alloca ] # AllUSMAspects; @@ -231,7 +231,7 @@ class CudaTargetInfo aspectList, int subGroupSiz defvar CudaMinAspects = !listconcat(AllUSMAspects, [AspectGpu, AspectFp64, AspectOnline_compiler, AspectOnline_linker, AspectQueue_profiling, AspectExt_intel_pci_address, AspectExt_intel_max_mem_bandwidth, AspectExt_intel_memory_bus_width, AspectExt_intel_device_info_uuid, AspectExt_oneapi_native_assert, AspectExt_intel_free_memory, AspectExt_intel_device_id, - AspectExt_intel_memory_clock_rate, AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, + AspectExt_intel_memory_clock_rate, AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph]); // Bindless images aspects are partially supported on CUDA and disabled by default at the moment. defvar CudaBindlessImagesAspects = [AspectExt_oneapi_bindless_images, AspectExt_oneapi_bindless_images_shared_usm, diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index fc9939cc63e5e..ad65b9056971d 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -76,7 +76,7 @@ struct is_ballot_group< sycl::ext::oneapi::experimental::ballot_group> : std::true_type {}; -template struct is_fixed_size_group : std::false_type {}; +template struct is_chunk : std::false_type {}; template struct is_chunk local_id) { if constexpr (is_tangle_or_opportunistic_group::value || is_ballot_group::value) return detail::IdToMaskPosition(g, local_id); - else if constexpr (is_fixed_size_group::value) + else if constexpr (is_chunk::value) return g.get_group_linear_id() * g.get_local_range().size() + local_id; else return local_id.get(0); @@ -983,7 +983,7 @@ EnableIfNativeShuffle ShuffleXor(GroupT g, T x, id<1> mask) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (is_chunk_v) { return cuda_shfl_sync_bfly_i32(MemberMask, x, static_cast(mask.get(0)), 0x1f); @@ -1031,7 +1031,7 @@ EnableIfNativeShuffle ShuffleDown(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (is_chunk_v) { return cuda_shfl_sync_down_i32(MemberMask, x, delta, 31); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1075,7 +1075,7 @@ EnableIfNativeShuffle ShuffleUp(GroupT g, T x, uint32_t delta) { if constexpr (ext::oneapi::experimental::is_user_constructed_group_v< GroupT>) { auto MemberMask = detail::ExtractMask(detail::GetMask(g))[0]; - if constexpr (is_fixed_size_group_v) { + if constexpr (is_chunk_v) { return cuda_shfl_sync_up_i32(MemberMask, x, delta, 0); } else { unsigned localSetBit = g.get_local_id()[0] + 1; @@ -1301,7 +1301,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { template <__spv::GroupOperation Op, size_t ChunkSize, \ typename ParentGroup, typename T> \ inline T Group##Instruction( \ - ext::oneapi::experimental::chunk g, T x) \ + ext::oneapi::experimental::chunk g, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = std::conditional_t< \ diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 63bdc88ff4ba1..18ba5dd7959e1 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -20,10 +20,10 @@ namespace sycl { inline namespace _V1 { namespace detail { -template struct is_fixed_size_group : std::false_type {}; +template struct is_chunk : std::false_type {}; template -inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; +inline constexpr bool is_chunk_v = is_chunk::value; template class OperationCurrentT, int... Indexes> diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp similarity index 93% rename from sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp rename to sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 8a8bda85bc243..4550e3ab965f6 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -1,4 +1,4 @@ -//==--- fixed_size_group.hpp --- SYCL extension for non-uniform groups -----==// +//==--- chunk.hpp --- SYCL extension for non-uniform groups -----==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,7 +10,7 @@ #include #include -#include // for is_fixed_size_group, is_group +#include // for is_chunk, is_group #include #include #include // for sub_group_mask @@ -30,7 +30,7 @@ template class chunk; template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_fixed_size_group)]] +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] #endif inline std::enable_if_t> && std::is_same_v, @@ -44,7 +44,9 @@ template class chunk { using linear_id_type = typename ParentGroup::linear_id_type; static constexpr int dimensions = 1; static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; - + /* ToDo:wd + we don't have fragment (operator fragment() const;) implementation yet. + */ id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ return __spirv_SubgroupLocalInvocationId() / ChunkSize; @@ -132,9 +134,9 @@ template class chunk { #endif #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - fixed_size_group(ext::oneapi::sub_group_mask mask) : Mask(mask) {} + chunk(ext::oneapi::sub_group_mask mask) : Mask(mask) {} #else - fixed_size_group() {} + chunk() {} #endif friend chunk diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 4b760cf637036..813b691a094bb 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -95,9 +95,9 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, //// Shuffle based masked reduction impls -// fixed_size_group group reduction using shfls +// chunk group reduction using shfls template -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { @@ -111,7 +111,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !is_fixed_size_group_v, + !is_chunk_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -208,10 +208,10 @@ inline __SYCL_ALWAYS_INLINE //// Shuffle based masked reduction impls -// fixed_size_group group scan using shfls +// chunk group scan using shfls template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> -inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { unsigned localIdVal = g.get_local_id()[0]; @@ -233,7 +233,7 @@ template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !is_fixed_size_group_v, + !is_chunk_v, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 8a931dde35a71..c2653c66dd186 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -46,7 +46,7 @@ __SYCL_ASPECT(ext_oneapi_mipmap_anisotropy, 51) __SYCL_ASPECT(ext_oneapi_mipmap_level_reference, 52) __SYCL_ASPECT(ext_intel_esimd, 53) __SYCL_ASPECT(ext_oneapi_ballot_group, 54) -__SYCL_ASPECT(ext_oneapi_fixed_size_group, 55) +__SYCL_ASPECT(ext_oneapi_chunk, 55) __SYCL_ASPECT(ext_oneapi_opportunistic_group, 56) __SYCL_ASPECT(ext_oneapi_tangle_group, 57) __SYCL_ASPECT(ext_intel_matrix, 58) diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 73aa4421d0caa..f081781407f9f 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -93,7 +93,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 178634322f47e..8deae6abd4542 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -658,7 +658,7 @@ bool device_impl::has(aspect Aspect) const { return call_successful && support; } case aspect::ext_oneapi_ballot_group: - case aspect::ext_oneapi_fixed_size_group: + case aspect::ext_oneapi_chunk: case aspect::ext_oneapi_opportunistic_group: { return (this->getBackend() == backend::ext_oneapi_level_zero) || (this->getBackend() == backend::opencl) || diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index 36601f9046189..9736af77ff7a0 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -7,10 +7,16 @@ // REQUIRES: cpu || gpu // UNSUPPORTED: hip // REQUIRES: sg-32 +// REQUIRES: aspect-ext_oneapi_chunk -#include -#include #include + +//#ifdef __SYCL_DEVICE_ONLY__ +//[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] + +#include +#include + namespace syclex = sycl::ext::oneapi::experimental; template class TestKernel; @@ -71,3 +77,5 @@ int main() { test<32>(); return 0; } + +//# endif diff --git a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index 458ee88368fc0..a32c99632de67 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -6,19 +6,16 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_fixed_size_group +// REQUIRES: aspect-ext_oneapi_chunk #include -#include +#include #include #include #include -namespace syclex = sycl::ext::oneapi::experimental; - - - +namespace syclex = sycl::ext::oneapi::experimental; template class TestKernel; diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp index c3ea8b59dc54c..74a028aa39188 100644 --- a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -2,7 +2,7 @@ // RUN: %{build} -fsyntax-only -o %t.out #include -#include +#include #include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -10,9 +10,9 @@ namespace syclex = sycl::ext::oneapi::experimental; static_assert( syclex::is_user_constructed_group_v>); static_assert(syclex::is_user_constructed_group_v< - syclex::fixed_size_group<1, sycl::sub_group>>); + syclex::chunk<1, sycl::sub_group>>); static_assert(syclex::is_user_constructed_group_v< - syclex::fixed_size_group<2, sycl::sub_group>>); + syclex::chunk<2, sycl::sub_group>>); static_assert( syclex::is_user_constructed_group_v>); static_assert(syclex::is_user_constructed_group_v); From 5e74e5730c3a04a7a751fc1a1c5b3b3b558522b2 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Sun, 24 Nov 2024 02:39:45 -0800 Subject: [PATCH 3/9] SYCL] Fixed-size groups and partitions are renamed to "chunks" --- sycl/include/sycl/detail/spirv.hpp | 30 +++++++++---------- sycl/include/sycl/detail/type_traits.hpp | 5 ++-- .../cuda/non_uniform_algorithms.hpp | 16 +++++----- sycl/include/sycl/sycl.hpp | 2 +- sycl/test-e2e/NonUniformGroups/chunk.cpp | 4 +-- .../NonUniformGroups/chunk_algorithms.cpp | 4 +-- .../NonUniformGroups/is_user_constructed.cpp | 8 ++--- 7 files changed, 32 insertions(+), 37 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index ad65b9056971d..c05d297c03711 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -79,8 +79,8 @@ struct is_ballot_group< template struct is_chunk : std::false_type {}; template -struct is_chunk> : std::true_type {}; +struct is_chunk> + : std::true_type {}; template struct group_scope {}; @@ -106,8 +106,8 @@ struct group_scope> { }; template -struct group_scope> { +struct group_scope< + sycl::ext::oneapi::experimental::chunk> { static constexpr __spv::Scope::Flag value = group_scope::value; }; @@ -175,9 +175,8 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } } template -bool GroupAll( - ext::oneapi::experimental::chunk, - bool pred) { +bool GroupAll(ext::oneapi::experimental::chunk, + bool pred) { // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, @@ -211,9 +210,8 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } } template -bool GroupAny( - ext::oneapi::experimental::chunk, - bool pred) { +bool GroupAny(ext::oneapi::experimental::chunk, + bool pred) { // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr( group_scope::value, @@ -328,9 +326,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, } } template -EnableIfNativeBroadcast GroupBroadcast( - ext::oneapi::experimental::chunk g, - T x, IdT local_id) { +EnableIfNativeBroadcast +GroupBroadcast(ext::oneapi::experimental::chunk g, T x, + IdT local_id) { // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * ChunkSize + local_id; @@ -1298,8 +1296,8 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { } \ } \ \ - template <__spv::GroupOperation Op, size_t ChunkSize, \ - typename ParentGroup, typename T> \ + template <__spv::GroupOperation Op, size_t ChunkSize, typename ParentGroup, \ + typename T> \ inline T Group##Instruction( \ ext::oneapi::experimental::chunk g, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ @@ -1319,7 +1317,7 @@ ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { constexpr auto OpInt = \ static_cast(__spv::GroupOperation::ClusteredReduce); \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \ - ChunkSize); \ + ChunkSize); \ } else { \ T tmp; \ for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \ diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 18ba5dd7959e1..fea9316c48868 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -22,8 +22,7 @@ inline namespace _V1 { namespace detail { template struct is_chunk : std::false_type {}; -template -inline constexpr bool is_chunk_v = is_chunk::value; +template inline constexpr bool is_chunk_v = is_chunk::value; template class OperationCurrentT, int... Indexes> @@ -157,7 +156,7 @@ template struct get_elem_type_unqual> { template class OperationCurrentT, int... Indexes> struct get_elem_type_unqual> { + OperationCurrentT, Indexes...>> { using type = typename get_elem_type_unqual>::type; }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp index 813b691a094bb..3b323229132f0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -57,8 +57,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsPlus::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsPlus::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -66,8 +66,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitAND::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitAND::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -75,8 +75,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitOR::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -84,8 +84,8 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } template -std::enable_if_t<(is_sugeninteger_v || - is_sigeninteger_v)&&IsBitXOR::value, +std::enable_if_t<(is_sugeninteger_v || is_sigeninteger_v) && + IsBitXOR::value, T> masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index f081781407f9f..36d1ba1e45e51 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -89,11 +89,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index 9736af77ff7a0..f89cb1592e63c 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -11,7 +11,7 @@ #include -//#ifdef __SYCL_DEVICE_ONLY__ +// #ifdef __SYCL_DEVICE_ONLY__ //[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] #include @@ -78,4 +78,4 @@ int main() { return 0; } -//# endif +// #endif diff --git a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp index a32c99632de67..762ede0534a60 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk_algorithms.cpp @@ -14,7 +14,6 @@ #include #include - namespace syclex = sycl::ext::oneapi::experimental; template class TestKernel; @@ -76,8 +75,7 @@ template void test() { uint32_t OriginalLID = SG.get_local_linear_id(); uint32_t LID = Partition.get_local_linear_id(); - uint32_t PartitionLeader = - (OriginalLID / ChunkSize) * ChunkSize; + uint32_t PartitionLeader = (OriginalLID / ChunkSize) * ChunkSize; uint32_t BroadcastResult = sycl::group_broadcast(Partition, OriginalLID, 0); BroadcastAcc[WI] = (BroadcastResult == PartitionLeader); diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp index 74a028aa39188..698df037e4289 100644 --- a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -9,10 +9,10 @@ namespace syclex = sycl::ext::oneapi::experimental; static_assert( syclex::is_user_constructed_group_v>); -static_assert(syclex::is_user_constructed_group_v< - syclex::chunk<1, sycl::sub_group>>); -static_assert(syclex::is_user_constructed_group_v< - syclex::chunk<2, sycl::sub_group>>); +static_assert( + syclex::is_user_constructed_group_v>); +static_assert( + syclex::is_user_constructed_group_v>); static_assert( syclex::is_user_constructed_group_v>); static_assert(syclex::is_user_constructed_group_v); From 66bafee26c15b0bd290145a242cd43c99dd3c382 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Sun, 24 Nov 2024 03:17:09 -0800 Subject: [PATCH 4/9] SYCL] Fixed-size groups and partitions are renamed to "chunks" + missed formatting --- .../sycl/ext/oneapi/experimental/chunk.hpp | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp index 4550e3ab965f6..16dd0c01e2f46 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/chunk.hpp @@ -34,8 +34,7 @@ template #endif inline std::enable_if_t> && std::is_same_v, - chunk> -chunked_partition(Group group); + chunk> chunked_partition(Group group); template class chunk { public: @@ -44,8 +43,10 @@ template class chunk { using linear_id_type = typename ParentGroup::linear_id_type; static constexpr int dimensions = 1; static constexpr sycl::memory_scope fence_scope = ParentGroup::fence_scope; - /* ToDo:wd - we don't have fragment (operator fragment() const;) implementation yet. + /* ToDo: + + fragment implementation to be done: + operator fragment() const; */ id_type get_group_id() const { #ifdef __SYCL_DEVICE_ONLY__ @@ -142,8 +143,7 @@ template class chunk { friend chunk chunked_partition(ParentGroup g); - friend sub_group_mask - sycl::detail::GetMask>( + friend sub_group_mask sycl::detail::GetMask>( chunk Group); }; @@ -157,10 +157,9 @@ chunked_partition(Group group) { #if defined(__NVPTX__) uint32_t loc_id = group.get_local_linear_id(); uint32_t loc_size = group.get_local_linear_range(); - uint32_t bits = ChunkSize == 32 - ? 0xffffffff - : ((1 << ChunkSize) - 1) - << ((loc_id / ChunkSize) * ChunkSize); + uint32_t bits = ChunkSize == 32 ? 0xffffffff + : ((1 << ChunkSize) - 1) + << ((loc_id / ChunkSize) * ChunkSize); return chunk( sycl::detail::Builder::createSubGroupMask( @@ -182,14 +181,12 @@ struct is_user_constructed_group> namespace detail { template -struct is_chunk< - ext::oneapi::experimental::chunk> +struct is_chunk> : std::true_type {}; } // namespace detail template -struct is_group< - ext::oneapi::experimental::chunk> +struct is_group> : std::true_type {}; } // namespace _V1 From 7379a882ee25a8fb0f777acf9f1409d6b9972a7c Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Mon, 2 Dec 2024 05:04:11 -0800 Subject: [PATCH 5/9] [SYCL] Fixed-size tangle group renamed to "tangle" + formatting --- .../llvm/SYCLLowerIR/DeviceConfigFile.td | 6 ++-- .../sycl_ext_oneapi_tangle.asciidoc | 2 +- sycl/include/sycl/detail/spirv.hpp | 12 +++---- .../experimental/non_uniform_groups.hpp | 2 +- .../{tangle_group.hpp => tangle.hpp} | 32 +++++++++---------- sycl/include/sycl/info/aspects.def | 2 +- sycl/include/sycl/sycl.hpp | 2 +- sycl/source/detail/device_impl.cpp | 4 +-- .../NonUniformGroups/is_user_constructed.cpp | 4 +-- .../{tangle_group.cpp => tangle.cpp} | 6 ++-- ...p_algorithms.cpp => tangle_algorithms.cpp} | 10 +++--- .../no-unsupported-without-info.cpp | 4 +-- sycl/test/regression/group_algorithms.cpp | 2 +- 13 files changed, 44 insertions(+), 44 deletions(-) rename sycl/include/sycl/ext/oneapi/experimental/{tangle_group.hpp => tangle.hpp} (85%) rename sycl/test-e2e/NonUniformGroups/{tangle_group.cpp => tangle.cpp} (94%) rename sycl/test-e2e/NonUniformGroups/{tangle_group_algorithms.cpp => tangle_algorithms.cpp} (96%) diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index d0ce4ed8eb7f1..7d7e4180889ec 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -73,7 +73,7 @@ def AspectExt_intel_esimd : Aspect<"ext_intel_esimd">; def AspectExt_oneapi_ballot_group : Aspect<"ext_oneapi_ballot_group">; def AspectExt_oneapi_chunk : Aspect<"ext_oneapi_chunk">; def AspectExt_oneapi_opportunistic_group : Aspect<"ext_oneapi_opportunistic_group">; -def AspectExt_oneapi_tangle_group : Aspect<"ext_oneapi_tangle_group">; +def AspectExt_oneapi_tangle : Aspect<"ext_oneapi_tangle">; def AspectExt_intel_matrix : Aspect<"ext_intel_matrix">; def AspectExt_oneapi_is_composite : Aspect<"ext_oneapi_is_composite">; def AspectExt_oneapi_is_component : Aspect<"ext_oneapi_is_component">; @@ -145,7 +145,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_bindless_sampled_image_fetch_3d, AspectExt_intel_esimd, AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group, - AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, + AspectExt_oneapi_tangle, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, AspectExt_oneapi_graph, AspectExt_oneapi_limited_graph, AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group, AspectExt_intel_fpga_task_sequence, @@ -164,7 +164,7 @@ defvar IntelCpuAspects = [ AspectExt_oneapi_srgb, AspectExt_oneapi_native_assert, AspectExt_intel_legacy_image, AspectExt_oneapi_ballot_group, AspectExt_oneapi_chunk, AspectExt_oneapi_opportunistic_group, - AspectExt_oneapi_tangle_group, AspectExt_oneapi_private_alloca + AspectExt_oneapi_tangle, AspectExt_oneapi_private_alloca ] # AllUSMAspects; def : TargetInfo<"spir64", [], [], "", "", 1>; diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_tangle.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_tangle.asciidoc index 7edb1d19b74af..374ffb011231c 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_tangle.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_tangle.asciidoc @@ -318,7 +318,7 @@ branches to safely communicate between all work-items executing the same control flow. NOTE: This differs from the `fragment` returned by `get_opportunistic_group()` -because a `tangle_group` requires the implementation to track group membership. +because a `tangle` requires the implementation to track group membership. Which group type to use will depend on a combination of implementation/backend/device and programmer preference. diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index c05d297c03711..b4597a2b3e82d 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -28,7 +28,7 @@ namespace experimental { template class ballot_group; template class chunk; template class root_group; -template class tangle_group; +template class tangle; class opportunistic_group; } // namespace experimental } // namespace oneapi @@ -62,7 +62,7 @@ struct is_tangle_or_opportunistic_group : std::false_type {}; template struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::tangle_group> + sycl::ext::oneapi::experimental::tangle> : std::true_type {}; template <> @@ -112,7 +112,7 @@ struct group_scope< }; template -struct group_scope> { +struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; @@ -184,7 +184,7 @@ bool GroupAll(ext::oneapi::experimental::chunk, static_cast(pred), ChunkSize); } template -bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { +bool GroupAll(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } @@ -219,7 +219,7 @@ bool GroupAny(ext::oneapi::experimental::chunk, static_cast(pred), ChunkSize); } template -bool GroupAny(ext::oneapi::experimental::tangle_group, bool pred) { +bool GroupAny(ext::oneapi::experimental::tangle, bool pred) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } bool GroupAny(const ext::oneapi::experimental::opportunistic_group &, @@ -347,7 +347,7 @@ GroupBroadcast(ext::oneapi::experimental::chunk g, T x, } template EnableIfNativeBroadcast -GroupBroadcast(ext::oneapi::experimental::tangle_group g, T x, +GroupBroadcast(ext::oneapi::experimental::tangle g, T x, IdT local_id) { // Remap local_id to its original numbering in ParentGroup. auto LocalId = detail::IdToMaskPosition(g, local_id); diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 4797eb75eff9c..0edc731423ba0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -74,7 +74,7 @@ namespace ext::oneapi::experimental { // Forward declarations of non-uniform group types for algorithm definitions template class ballot_group; template class chunk; -template class tangle_group; +template class tangle; class opportunistic_group; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp similarity index 85% rename from sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp rename to sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 59af6bdfc753b..311a0b4e3a257 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -1,4 +1,4 @@ -//==------ tangle_group.hpp --- SYCL extension for non-uniform groups ------==// +//==------ tangle.hpp --- SYCL extension for non-uniform groups ------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -25,17 +25,17 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -template class tangle_group; +template class tangle; template #ifdef __SYCL_DEVICE_ONLY__ -[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_tangle_group)]] +[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_tangle)]] #endif inline std::enable_if_t> && std::is_same_v, - tangle_group> get_tangle_group(Group group); + tangle> get_tangle(Group group); -template class tangle_group { +template class tangle { public: using id_type = id<1>; using range_type = range<1>; @@ -128,19 +128,19 @@ template class tangle_group { protected: sub_group_mask Mask; - tangle_group(sub_group_mask m) : Mask(m) {} + tangle(sub_group_mask m) : Mask(m) {} - friend tangle_group get_tangle_group(ParentGroup); + friend tangle get_tangle(ParentGroup); - friend sub_group_mask sycl::detail::GetMask>( - tangle_group Group); + friend sub_group_mask sycl::detail::GetMask>( + tangle Group); }; template inline std::enable_if_t> && std::is_same_v, - tangle_group> -get_tangle_group(Group group) { + tangle> +get_tangle(Group group) { (void)group; #ifdef __SYCL_DEVICE_ONLY__ #if defined(__SPIR__) || defined(__SPIRV__) @@ -149,12 +149,12 @@ get_tangle_group(Group group) { // We store the mask here because it is required to calculate IDs, not // because it is required to construct the group. sub_group_mask mask = sycl::ext::oneapi::group_ballot(group, true); - return tangle_group(mask); + return tangle(mask); #elif defined(__NVPTX__) // TODO: Construct from compiler-generated mask. Return an invalid group in - // in the meantime. CUDA devices will report false for the tangle_group + // in the meantime. CUDA devices will report false for the tangle // support aspect so kernels launch should ensure this is never run. - return tangle_group(0); + return tangle(0); #endif #else throw exception(make_error_code(errc::runtime), @@ -164,12 +164,12 @@ get_tangle_group(Group group) { } // namespace this_kernel template -struct is_user_constructed_group> : std::true_type {}; +struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental template -struct is_group> +struct is_group> : std::true_type {}; } // namespace _V1 diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index c2653c66dd186..dde691014eb62 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -48,7 +48,7 @@ __SYCL_ASPECT(ext_intel_esimd, 53) __SYCL_ASPECT(ext_oneapi_ballot_group, 54) __SYCL_ASPECT(ext_oneapi_chunk, 55) __SYCL_ASPECT(ext_oneapi_opportunistic_group, 56) -__SYCL_ASPECT(ext_oneapi_tangle_group, 57) +__SYCL_ASPECT(ext_oneapi_tangle, 57) __SYCL_ASPECT(ext_intel_matrix, 58) __SYCL_ASPECT(ext_oneapi_is_composite, 59) __SYCL_ASPECT(ext_oneapi_is_component, 60) diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 36d1ba1e45e51..27f66c2751986 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -103,7 +103,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 2635ea24ce2ce..2fecca4645944 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -664,8 +664,8 @@ bool device_impl::has(aspect Aspect) const { (this->getBackend() == backend::opencl) || (this->getBackend() == backend::ext_oneapi_cuda); } - case aspect::ext_oneapi_tangle_group: { - // TODO: tangle_group is not currently supported for CUDA devices. Add when + case aspect::ext_oneapi_tangle: { + // TODO: tangle is not currently supported for CUDA devices. Add when // implemented. return (this->getBackend() == backend::ext_oneapi_level_zero) || (this->getBackend() == backend::opencl); diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp index 698df037e4289..ecf5a12324174 100644 --- a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace syclex = sycl::ext::oneapi::experimental; static_assert( @@ -14,5 +14,5 @@ static_assert( static_assert( syclex::is_user_constructed_group_v>); static_assert( - syclex::is_user_constructed_group_v>); + syclex::is_user_constructed_group_v>); static_assert(syclex::is_user_constructed_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle.cpp similarity index 94% rename from sycl/test-e2e/NonUniformGroups/tangle_group.cpp rename to sycl/test-e2e/NonUniformGroups/tangle.cpp index 44191955048f3..f428356a3e022 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: cuda || hip #include -#include +#include #include namespace syclex = sycl::ext::oneapi::experimental; @@ -45,7 +45,7 @@ int main() { // Branches deliberately duplicated to test impact of optimizations. // This only reliably works with optimizations disabled right now. if (item.get_global_id() % 2 == 0) { - auto TangleGroup = syclex::get_tangle_group(SG); + auto TangleGroup = syclex::get_tangle(SG); bool Match = true; Match &= (TangleGroup.get_group_id() == 0); @@ -56,7 +56,7 @@ int main() { MatchAcc[WI] = Match; LeaderAcc[WI] = TangleGroup.leader(); } else { - auto TangleGroup = syclex::get_tangle_group(SG); + auto TangleGroup = syclex::get_tangle(SG); bool Match = true; Match &= (TangleGroup.get_group_id() == 0); diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp similarity index 96% rename from sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp rename to sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp index 7033c4c9e4df5..4f6181b346604 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_algorithms.cpp @@ -6,13 +6,13 @@ // // REQUIRES: cpu || gpu // REQUIRES: sg-32 -// REQUIRES: aspect-ext_oneapi_tangle_group +// REQUIRES: aspect-ext_oneapi_tangle // UNSUPPORTED: cuda || windows // Tangle groups exhibit unpredictable behavior on Windows. // The test is disabled while we investigate the root cause. #include -#include +#include #include #include #include @@ -120,13 +120,13 @@ int main() { // Split into three groups of different sizes, using control flow // Body of each branch is deliberately duplicated if (WI < 4) { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::get_tangle(SG); size_t TangleLeader = 0; size_t TangleSize = 4; auto IsMember = [](size_t Other) { return (Other < 4); }; BranchBody(WI, Tangle, TangleLeader, TangleSize, IsMember); } else if (WI < 24) { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::get_tangle(SG); size_t TangleLeader = 4; size_t TangleSize = 20; auto IsMember = [](size_t Other) { @@ -134,7 +134,7 @@ int main() { }; BranchBody(WI, Tangle, TangleLeader, TangleSize, IsMember); } else /* if WI < 32) */ { - auto Tangle = syclex::get_tangle_group(SG); + auto Tangle = syclex::get_tangle(SG); size_t TangleLeader = 24; size_t TangleSize = 8; auto IsMember = [](size_t Other) { diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index ec44b32f182f7..b24d8c41225c9 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -301,8 +301,8 @@ // CHECK-NEXT: NewOffloadDriver/spirv_device_obj_smoke.cpp // CHECK-NEXT: NonUniformGroups/ballot_group.cpp // CHECK-NEXT: NonUniformGroups/opportunistic_group.cpp -// CHECK-NEXT: NonUniformGroups/tangle_group.cpp -// CHECK-NEXT: NonUniformGroups/tangle_group_algorithms.cpp +// CHECK-NEXT: NonUniformGroups/tangle.cpp +// CHECK-NEXT: NonUniformGroups/tangle_algorithms.cpp // CHECK-NEXT: OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp // CHECK-NEXT: OptionalKernelFeatures/large-reqd-work-group-size.cpp // CHECK-NEXT: OptionalKernelFeatures/no-fp64-optimization-declared-aspects.cpp diff --git a/sycl/test/regression/group_algorithms.cpp b/sycl/test/regression/group_algorithms.cpp index 8e8af2b2fa90b..5fce3cf3c03c2 100644 --- a/sycl/test/regression/group_algorithms.cpp +++ b/sycl/test/regression/group_algorithms.cpp @@ -155,7 +155,7 @@ int main() { TestForGroup(NDI.get_group()); TestForGroup(syclex::get_ballot_group(SG, true)); TestForGroup(syclex::chunked_partition<8>(SG)); - TestForGroup(syclex::get_tangle_group(SG)); + TestForGroup(syclex::get_tangle(SG)); TestForGroup(syclex::this_kernel::get_opportunistic_group()); }); return 0; From a71d591477e24b520d64699e0a7d268c1167b300 Mon Sep 17 00:00:00 2001 From: Andrei Zibrov Date: Mon, 2 Dec 2024 06:18:09 -0800 Subject: [PATCH 6/9] [SYCL] applied formatting required by code_formatter CI job --- sycl/include/sycl/detail/spirv.hpp | 3 +-- sycl/include/sycl/ext/oneapi/experimental/tangle.hpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index b4597a2b3e82d..c441bc5ca65db 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -62,8 +62,7 @@ struct is_tangle_or_opportunistic_group : std::false_type {}; template struct is_tangle_or_opportunistic_group< - sycl::ext::oneapi::experimental::tangle> - : std::true_type {}; + sycl::ext::oneapi::experimental::tangle> : std::true_type {}; template <> struct is_tangle_or_opportunistic_group< diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 311a0b4e3a257..85303c594ca6a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -132,8 +132,8 @@ template class tangle { friend tangle get_tangle(ParentGroup); - friend sub_group_mask sycl::detail::GetMask>( - tangle Group); + friend sub_group_mask + sycl::detail::GetMask>(tangle Group); }; template From b34d722e087ed4ed70a3401dea12d0a51cafc88b Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Tue, 4 Feb 2025 12:14:03 +0100 Subject: [PATCH 7/9] Update sycl/test-e2e/NonUniformGroups/chunk.cpp Co-authored-by: Alexey Sachkov --- sycl/test-e2e/NonUniformGroups/chunk.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index f89cb1592e63c..0a3fe258d3fe0 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -77,5 +77,3 @@ int main() { test<32>(); return 0; } - -// #endif From 77d1e4bc0d851cfe0689f548128b3eec689e7968 Mon Sep 17 00:00:00 2001 From: AndreiZibrov Date: Tue, 4 Feb 2025 12:14:16 +0100 Subject: [PATCH 8/9] Update sycl/test-e2e/NonUniformGroups/chunk.cpp Co-authored-by: Alexey Sachkov --- sycl/test-e2e/NonUniformGroups/chunk.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/chunk.cpp b/sycl/test-e2e/NonUniformGroups/chunk.cpp index 0a3fe258d3fe0..7acb3e0455974 100644 --- a/sycl/test-e2e/NonUniformGroups/chunk.cpp +++ b/sycl/test-e2e/NonUniformGroups/chunk.cpp @@ -11,9 +11,6 @@ #include -// #ifdef __SYCL_DEVICE_ONLY__ -//[[__sycl_detail__::__uses_aspects__(sycl::aspect::ext_oneapi_chunk)]] - #include #include From b6f8b71a73997e939391702d16949aefe7f14708 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 22 May 2025 16:33:35 +0200 Subject: [PATCH 9/9] Update sycl/include/sycl/ext/oneapi/experimental/tangle.hpp Co-authored-by: Alexey Sachkov --- sycl/include/sycl/ext/oneapi/experimental/tangle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp index 85303c594ca6a..ad6a390888a6e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle.hpp @@ -140,7 +140,7 @@ template inline std::enable_if_t> && std::is_same_v, tangle> -get_tangle(Group group) { +entangle(Group group) { (void)group; #ifdef __SYCL_DEVICE_ONLY__ #if defined(__SPIR__) || defined(__SPIRV__)