Skip to content

Commit 81aacfa

Browse files
authoredSep 11, 2024··
[SYCL] Implement max_num_work_groups from the launch queries extension (#14333)
This PR implements the `max_num_work_groups ` query from the `sycl_ext_oneapi_launch_queries` extension. Additionally, this PR introduces changes that overload `ext_oneapi_get_info` for another kernel-queue-specific query - `max_num_work_group_sync` to take extra parameters for local work-group size and dynamic local memory size (in bytes) in order to allow users to pass those runtime resource limiting factors to the query, so they are taken into account in the final group count suggestion.
1 parent 729d6f6 commit 81aacfa

16 files changed

+424
-37
lines changed
 

‎sycl/cmake/modules/FetchUnifiedRuntime.cmake

+7-7
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
117117
endfunction()
118118

119119
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
120-
# commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
121-
# Merge: a99dbcee 3abe18cf
122-
# Author: Piotr Balcer <piotr.balcer@intel.com>
123-
# Date: Fri Sep 6 17:21:17 2024 +0200
124-
# Merge pull request #1820 from pbalcer/static-linking
125-
# Add support for static linking of the L0 adapter
126-
set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
120+
# commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
121+
# Merge: e26bba51 45a781f4
122+
# Author: Omar Ahmed <omar.ahmed@codeplay.com>
123+
# Date: Tue Sep 10 12:08:57 2024 +0100
124+
# Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
125+
# [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
126+
set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
127127

128128
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
129129
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need

‎sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc

+5-3
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,11 @@ otherwise it is 0.
204204
|Returns the maximum number of work-groups, when the kernel is submitted to the
205205
specified queue with the specified work-group size and the specified amount of
206206
dynamic work-group local memory (in bytes), accounting for any kernel
207-
properties or features. If the kernel can be submitted to the specified queue
208-
without an error, the minimum value returned by this query is 1, otherwise it
209-
is 0.
207+
properties or features. If the specified work-group size is 0, which is
208+
invalid, then the implementation will throw a synchronous exception with the
209+
`errc::invalid` error code. If the kernel can be submitted to the specified
210+
queue without an error, the minimum value returned by this query is 1,
211+
otherwise it is 0.
210212

211213
|===
212214

‎sycl/include/sycl/detail/info_desc_helpers.hpp

+12
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
3131
template <typename T> struct is_kernel_info_desc : std::false_type {};
3232
template <typename T>
3333
struct is_kernel_device_specific_info_desc : std::false_type {};
34+
template <typename T>
35+
struct is_kernel_queue_specific_info_desc : std::false_type {};
3436
template <typename T> struct is_event_info_desc : std::false_type {};
3537
template <typename T> struct is_event_profiling_info_desc : std::false_type {};
3638
// Normally we would just use std::enable_if to limit valid get_info template
@@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
134136
#include <sycl/info/ext_intel_device_traits.def>
135137
#include <sycl/info/ext_oneapi_device_traits.def>
136138
#undef __SYCL_PARAM_TRAITS_SPEC
139+
140+
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode) \
141+
template <> \
142+
struct is_##DescType##_info_desc<Namespace::info::DescType::Desc> \
143+
: std::true_type { \
144+
using return_type = Namespace::info::DescType::Desc::return_type; \
145+
};
146+
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
147+
#undef __SYCL_PARAM_TRAITS_SPEC
148+
137149
#define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode) \
138150
template <> \
139151
struct is_backend_info_desc<info::DescType::Desc> : std::true_type { \

‎sycl/include/sycl/ext/oneapi/experimental/root_group.hpp

+2-7
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,8 @@ namespace sycl {
2424
inline namespace _V1 {
2525
namespace ext::oneapi::experimental {
2626

27-
namespace info::kernel_queue_specific {
28-
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
29-
// #7598 is merged.
30-
struct max_num_work_group_sync {
31-
using return_type = size_t;
32-
};
33-
} // namespace info::kernel_queue_specific
27+
// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
28+
// device-specific properties that relate to 'root_group'.
3429

3530
template <int Dimensions> class root_group {
3631
public:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// TODO: Revisit 'max_num_work_group_sync' and align it with the
2+
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
3+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
4+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)

‎sycl/include/sycl/info/info_desc.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
247247
#include <sycl/info/ext_codeplay_device_traits.def>
248248
#include <sycl/info/ext_intel_device_traits.def>
249249
#include <sycl/info/ext_oneapi_device_traits.def>
250+
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
251+
250252
#undef __SYCL_PARAM_TRAITS_SPEC
251253
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
252254
} // namespace _V1

‎sycl/include/sycl/kernel.hpp

+22-2
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
159159
get_info(const device &Device, const range<3> &WGSize) const;
160160

161161
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
162-
// once #7598 is merged.
162+
// once #7598 is merged. (regarding the 'max_num_work_group_sync' query)
163+
164+
/// Query queue/launch-specific information from a kernel using the
165+
/// info::kernel_queue_specific descriptor for a specific Queue.
166+
///
167+
/// \param Queue is a valid SYCL queue.
168+
/// \return depends on information being queried.
169+
template <typename Param>
170+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
171+
ext_oneapi_get_info(queue Queue) const;
172+
173+
/// Query queue/launch-specific information from a kernel using the
174+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
175+
/// max_num_work_groups is the only valid descriptor for this function.
176+
///
177+
/// \param Queue is a valid SYCL queue.
178+
/// \param WorkGroupSize is the work-group size the number of work-groups is
179+
/// requested for.
180+
/// \return depends on information being queried.
163181
template <typename Param>
164-
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
182+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
183+
ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
184+
size_t DynamicLocalMemorySize) const;
165185

166186
private:
167187
/// Constructs a SYCL kernel object from a valid kernel_impl instance.

‎sycl/source/detail/kernel_impl.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
106106
"interoperability function or to query a device built-in kernel");
107107
}
108108

109+
bool kernel_impl::exceedsOccupancyResourceLimits(
110+
const device &Device, const range<3> &WorkGroupSize,
111+
size_t DynamicLocalMemorySize) const {
112+
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
113+
// Generally, exceeding hardware resource limits will yield in an error when
114+
// the kernel is launched.
115+
const size_t MaxWorkGroupSize =
116+
get_info<info::kernel_device_specific::work_group_size>(Device);
117+
const size_t MaxLocalMemorySizeInBytes =
118+
Device.get_info<info::device::local_mem_size>();
119+
120+
if (WorkGroupSize.size() > MaxWorkGroupSize)
121+
return true;
122+
123+
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
124+
return true;
125+
126+
// It will be impossible to launch a kernel for Cuda when the hardware limit
127+
// for the 32-bit registers page file size is exceeded.
128+
if (Device.get_backend() == backend::ext_oneapi_cuda) {
129+
const uint32_t RegsPerWorkItem =
130+
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
131+
const uint32_t MaxRegsPerWorkGroup =
132+
Device.get_info<ext::codeplay::experimental::info::device::
133+
max_registers_per_work_group>();
134+
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
135+
return true;
136+
}
137+
138+
return false;
139+
}
140+
109141
template <>
110142
typename info::platform::version::return_type
111143
kernel_impl::get_backend_info<info::platform::version>() const {

‎sycl/source/detail/kernel_impl.hpp

+80-10
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,26 @@ class kernel_impl {
114114
typename Param::return_type get_info(const device &Device,
115115
const range<3> &WGSize) const;
116116

117+
/// Query queue/launch-specific information from a kernel using the
118+
/// info::kernel_queue_specific descriptor for a specific Queue.
119+
///
120+
/// \param Queue is a valid SYCL queue.
121+
/// \return depends on information being queried.
122+
template <typename Param>
123+
typename Param::return_type ext_oneapi_get_info(queue Queue) const;
124+
125+
/// Query queue/launch-specific information from a kernel using the
126+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
127+
/// max_num_work_groups is the only valid descriptor for this function.
128+
///
129+
/// \param Queue is a valid SYCL queue.
130+
/// \param WorkGroupSize is the work-group size the number of work-groups is
131+
/// requested for.
132+
/// \return depends on information being queried.
117133
template <typename Param>
118-
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
134+
typename Param::return_type
135+
ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
136+
size_t DynamicLocalMemorySize) const;
119137

120138
/// Get a constant reference to a raw kernel object.
121139
///
@@ -171,6 +189,12 @@ class kernel_impl {
171189

172190
bool isBuiltInKernel(const device &Device) const;
173191
void checkIfValidForNumArgsInfoQuery() const;
192+
193+
/// Check if the occupancy limits are exceeded for the given kernel launch
194+
/// configuration.
195+
bool exceedsOccupancyResourceLimits(const device &Device,
196+
const range<3> &WorkGroupSize,
197+
size_t DynamicLocalMemorySize) const;
174198
};
175199

176200
template <typename Param>
@@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
217241
getPlugin());
218242
}
219243

244+
namespace syclex = ext::oneapi::experimental;
245+
220246
template <>
221-
inline typename ext::oneapi::experimental::info::kernel_queue_specific::
222-
max_num_work_group_sync::return_type
247+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
248+
return_type
223249
kernel_impl::ext_oneapi_get_info<
224-
ext::oneapi::experimental::info::kernel_queue_specific::
225-
max_num_work_group_sync>(const queue &Queue) const {
250+
syclex::info::kernel_queue_specific::max_num_work_groups>(
251+
queue Queue, const range<3> &WorkGroupSize,
252+
size_t DynamicLocalMemorySize) const {
253+
if (WorkGroupSize.size() == 0)
254+
throw exception(sycl::make_error_code(errc::invalid),
255+
"The launch work-group size cannot be zero.");
256+
226257
const auto &Plugin = getPlugin();
227258
const auto &Handle = getHandleRef();
259+
auto Device = Queue.get_device();
260+
261+
uint32_t GroupCount{0};
262+
if (auto Result = Plugin->call_nocheck<
263+
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
264+
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
265+
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
266+
// The feature is supported. Check for other errors and throw if any.
267+
Plugin->checkUrResult(Result);
268+
return GroupCount;
269+
}
270+
271+
// Fallback. If the backend API is unsupported, this query will return either
272+
// 0 or 1 based on the kernel resource usage and the user-requested resources.
273+
return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
274+
DynamicLocalMemorySize)
275+
? 0
276+
: 1;
277+
}
278+
279+
template <>
280+
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
281+
return_type
282+
kernel_impl::ext_oneapi_get_info<
283+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
284+
queue Queue, const range<3> &WorkGroupSize,
285+
size_t DynamicLocalMemorySize) const {
286+
return ext_oneapi_get_info<
287+
syclex::info::kernel_queue_specific::max_num_work_groups>(
288+
Queue, WorkGroupSize, DynamicLocalMemorySize);
289+
}
290+
291+
template <>
292+
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
293+
return_type
294+
kernel_impl::ext_oneapi_get_info<
295+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
296+
queue Queue) const {
297+
auto Device = Queue.get_device();
228298
const auto MaxWorkGroupSize =
229-
Queue.get_device().get_info<info::device::max_work_group_size>();
230-
uint32_t GroupCount = 0;
231-
Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
232-
Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
233-
return GroupCount;
299+
get_info<info::kernel_device_specific::work_group_size>(Device);
300+
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
301+
return ext_oneapi_get_info<
302+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
303+
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
234304
}
235305

236306
} // namespace detail

‎sycl/source/kernel.cpp

+23-3
Original file line numberDiff line numberDiff line change
@@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
106106
const device &, const sycl::range<3> &) const;
107107

108108
template <typename Param>
109-
typename Param::return_type
110-
kernel::ext_oneapi_get_info(const queue &Queue) const {
109+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
110+
kernel::ext_oneapi_get_info(queue Queue) const {
111111
return impl->ext_oneapi_get_info<Param>(Queue);
112112
}
113113

114+
template <typename Param>
115+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
116+
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
117+
size_t DynamicLocalMemorySize) const {
118+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
119+
DynamicLocalMemorySize);
120+
}
121+
114122
template __SYCL_EXPORT typename ext::oneapi::experimental::info::
115123
kernel_queue_specific::max_num_work_group_sync::return_type
116124
kernel::ext_oneapi_get_info<
117125
ext::oneapi::experimental::info::kernel_queue_specific::
118-
max_num_work_group_sync>(const queue &Queue) const;
126+
max_num_work_group_sync>(queue Queue) const;
127+
128+
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
129+
template __SYCL_EXPORT ReturnT \
130+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
131+
queue, const range<3> &, size_t) const;
132+
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
133+
// kernel_queue_specific queries require the above-defined get_info interface.
134+
// clang-format off
135+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
136+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
137+
// clang-format on
138+
#undef __SYCL_PARAM_TRAITS_SPEC
119139

120140
kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
121141

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/detail/core.hpp>
5+
6+
#include <cassert>
7+
#include <cstdint>
8+
9+
namespace syclex = sycl::ext::oneapi::experimental;
10+
using namespace sycl::info::device;
11+
using namespace sycl::info::kernel_device_specific;
12+
13+
using value_type = int64_t;
14+
15+
namespace kernels {
16+
17+
template <class T, size_t Dim>
18+
using sycl_global_accessor =
19+
sycl::accessor<T, Dim, sycl::access::mode::read_write,
20+
sycl::access::target::global_buffer>;
21+
22+
class TestKernel {
23+
public:
24+
static constexpr bool HasLocalMemory{false};
25+
26+
TestKernel(sycl_global_accessor<value_type, 1> acc) : acc_{acc} {}
27+
28+
void operator()(sycl::nd_item<1> item) const {
29+
const auto gtid = item.get_global_linear_id();
30+
acc_[gtid] = gtid + 42;
31+
}
32+
33+
private:
34+
sycl_global_accessor<value_type, 1> acc_;
35+
};
36+
37+
class TestLocalMemoryKernel {
38+
public:
39+
static constexpr bool HasLocalMemory{true};
40+
41+
TestLocalMemoryKernel(sycl_global_accessor<value_type, 1> acc,
42+
sycl::local_accessor<value_type, 1> loc_acc)
43+
: acc_{acc}, loc_acc_{loc_acc} {}
44+
45+
void operator()(sycl::nd_item<1> item) const {
46+
const auto ltid = item.get_local_linear_id();
47+
const auto gtid = item.get_global_linear_id();
48+
if (ltid < loc_acc_.size()) {
49+
loc_acc_[ltid] = ltid + 42;
50+
item.barrier(sycl::access::fence_space::local_space);
51+
acc_[gtid] = loc_acc_[ltid];
52+
} else {
53+
acc_[gtid] = 0;
54+
}
55+
}
56+
57+
private:
58+
sycl_global_accessor<value_type, 1> acc_;
59+
sycl::local_accessor<value_type, 1> loc_acc_;
60+
};
61+
62+
} // namespace kernels
63+
64+
namespace {
65+
66+
template <class KernelName>
67+
int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
68+
const auto ctx = q.get_context();
69+
auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
70+
auto kernel = bundle.template get_kernel<KernelName>();
71+
72+
const size_t maxWorkGroupSize =
73+
kernel.template get_info<work_group_size>(dev);
74+
const size_t NumWorkItems = maxWorkGroupSize * maxWorkGroupSize;
75+
76+
size_t workGroupSize = 32;
77+
size_t localMemorySizeInBytes{0};
78+
if constexpr (KernelName::HasLocalMemory) {
79+
localMemorySizeInBytes = workGroupSize * sizeof(value_type);
80+
}
81+
82+
sycl::buffer<value_type, 1> buf{sycl::range<1>{NumWorkItems}};
83+
84+
// Tests
85+
86+
// ==================== //
87+
// Test 1 - return type //
88+
// ==================== //
89+
sycl::range<3> workGroupRange{workGroupSize, 1, 1};
90+
auto maxWGs = kernel.template ext_oneapi_get_info<
91+
syclex::info::kernel_queue_specific::max_num_work_groups>(
92+
q, workGroupRange, localMemorySizeInBytes);
93+
94+
// Test the return type is as specified in the extension document.
95+
static_assert(std::is_same_v<std::remove_cv_t<decltype(maxWGs)>, size_t>,
96+
"max_num_work_groups query must return size_t");
97+
98+
// ===================== //
99+
// Test 2 - return value //
100+
// ===================== //
101+
// We must have at least one active group if we are below resource limits.
102+
assert(maxWGs > 0 && "max_num_work_groups query failed");
103+
if (maxWGs == 0)
104+
return 1;
105+
106+
// Run the kernel
107+
auto launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
108+
sycl::range<1>{workGroupSize}};
109+
q.submit([&](sycl::handler &cgh) {
110+
auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
111+
if constexpr (KernelName::HasLocalMemory) {
112+
sycl::local_accessor<value_type, 1> loc_acc{
113+
sycl::range<1>{workGroupSize}, cgh};
114+
cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
115+
} else {
116+
cgh.parallel_for(launch_range, KernelName{acc});
117+
}
118+
}).wait();
119+
assert(sycl::host_accessor{buf}[0] == 42);
120+
121+
// ========================== //
122+
// Test 3 - use max resources //
123+
// ========================== //
124+
// A little over the maximum work-group size for the purpose of exceeding.
125+
workGroupSize = maxWorkGroupSize;
126+
workGroupRange[0] = workGroupSize;
127+
size_t localSize =
128+
(dev.get_info<sycl::info::device::local_mem_size>() / sizeof(value_type));
129+
if constexpr (KernelName::HasLocalMemory) {
130+
localMemorySizeInBytes = localSize * sizeof(value_type);
131+
}
132+
maxWGs = kernel.template ext_oneapi_get_info<
133+
syclex::info::kernel_queue_specific::max_num_work_groups>(
134+
q, workGroupRange, localMemorySizeInBytes);
135+
136+
assert(maxWGs > 0 && "max_num_work_groups query failed");
137+
if (maxWGs == 0)
138+
return 1;
139+
140+
launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
141+
sycl::range<1>{workGroupSize}};
142+
143+
q.submit([&](sycl::handler &cgh) {
144+
auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
145+
if constexpr (KernelName::HasLocalMemory) {
146+
sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
147+
cgh};
148+
cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
149+
} else {
150+
cgh.parallel_for(launch_range, KernelName{acc});
151+
}
152+
}).wait();
153+
assert(sycl::host_accessor{buf}[0] == 42);
154+
155+
// =============================== //
156+
// Test 4 - exceed resource limits //
157+
// =============================== //
158+
workGroupSize = maxWorkGroupSize + 32;
159+
workGroupRange[0] = workGroupSize;
160+
maxWGs = kernel.template ext_oneapi_get_info<
161+
syclex::info::kernel_queue_specific::max_num_work_groups>(
162+
q, workGroupRange, localMemorySizeInBytes);
163+
// It cannot be possible to launch a kernel successfully with a configuration
164+
// that exceeds the available resources as in the above defined workGroupSize.
165+
// workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
166+
if (dev.get_backend() == sycl::backend::ext_oneapi_cuda) {
167+
assert(maxWGs == 0 && "max_num_work_groups query failed");
168+
if (maxWGs > 0)
169+
return 1;
170+
}
171+
172+
// As we ensured that the 'max_num_work_groups' query correctly
173+
// returns 0 possible work-groups, test that the kernel launch will fail.
174+
// A configuration that defines a work-group size larger than the maximum
175+
// possible should result in failure.
176+
try {
177+
launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
178+
sycl::range<1>{workGroupSize}};
179+
180+
q.submit([&](sycl::handler &cgh) {
181+
auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
182+
if constexpr (KernelName::HasLocalMemory) {
183+
sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
184+
cgh};
185+
cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
186+
} else {
187+
cgh.parallel_for(launch_range, KernelName{acc});
188+
}
189+
}).wait();
190+
} catch (const sycl::exception &e) {
191+
// 'nd_range' error is the expected outcome from the above launch config.
192+
if (e.code() == sycl::make_error_code(sycl::errc::nd_range)) {
193+
return 0;
194+
}
195+
std::cerr << e.code() << ":\t";
196+
std::cerr << e.what() << std::endl;
197+
return 1;
198+
}
199+
200+
return 0;
201+
}
202+
203+
} // namespace
204+
205+
int main() {
206+
sycl::queue q{};
207+
sycl::device dev = q.get_device();
208+
209+
using namespace kernels;
210+
211+
int ret{0};
212+
ret &= test_max_num_work_groups<TestKernel>(q, dev);
213+
ret &= test_max_num_work_groups<TestLocalMemoryKernel>(q, dev);
214+
return ret;
215+
}

‎sycl/test-e2e/GroupAlgorithm/root_group.cpp

+12-3
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,21 @@ void testQueriesAndProperties() {
3030
const auto maxWGs = kernel.ext_oneapi_get_info<
3131
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
3232
max_num_work_group_sync>(q);
33+
const auto wgRange = sycl::range{WorkGroupSize, 1, 1};
34+
const auto maxWGsWithLimits = kernel.ext_oneapi_get_info<
35+
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
36+
max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int));
3337
const auto props = sycl::ext::oneapi::experimental::properties{
3438
sycl::ext::oneapi::experimental::use_root_sync};
3539
q.single_task<class QueryKernel>(props, []() {});
36-
static_assert(std::is_same_v<std::remove_cv<decltype(maxWGs)>::type, size_t>,
37-
"max_num_work_group_sync query must return size_t");
38-
assert(maxWGs >= 1 && "max_num_work_group_sync query failed");
40+
41+
static auto check_max_num_work_group_sync = [](auto Result) {
42+
static_assert(std::is_same_v<std::remove_cv_t<decltype(Result)>, size_t>,
43+
"max_num_work_group_sync query must return size_t");
44+
assert(Result >= 1 && "max_num_work_group_sync query failed");
45+
};
46+
check_max_num_work_group_sync(maxWGs);
47+
check_max_num_work_group_sync(maxWGsWithLimits);
3948
}
4049

4150
void testRootGroup() {

‎sycl/test/abi/sycl_symbols_linux.dump

+3-1
Original file line numberDiff line numberDiff line change
@@ -3915,8 +3915,10 @@ _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device15backend_versionEEENS0_6
39153915
_ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
39163916
_ZNK4sycl3_V16kernel16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
39173917
_ZNK4sycl3_V16kernel17get_kernel_bundleEv
3918-
_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENT_11return_typeERKNS0_5queueE
39193918
_ZNK4sycl3_V16kernel3getEv
3919+
_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueE
3920+
_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
3921+
_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific19max_num_work_groupsEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
39203922
_ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific15work_group_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
39213923
_ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16global_work_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
39223924
_ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16private_mem_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE

‎sycl/test/abi/sycl_symbols_windows.dump

+3-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313
??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z
1414
??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z
1515
??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ
16-
??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVqueue@12@@Z
16+
??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@@Z
17+
??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
18+
??$ext_oneapi_get_info@Umax_num_work_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
1719
??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
1820
??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
1921
??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@event@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ

‎sycl/test/include_deps/sycl_accessor.hpp.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@
110110
// CHECK-NEXT: info/ext_codeplay_device_traits.def
111111
// CHECK-NEXT: info/ext_intel_device_traits.def
112112
// CHECK-NEXT: info/ext_oneapi_device_traits.def
113+
// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
113114
// CHECK-NEXT: info/sycl_backend_traits.def
114115
// CHECK-NEXT: platform.hpp
115116
// CHECK-NEXT: detail/string_view.hpp

‎sycl/test/include_deps/sycl_detail_core.hpp.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
// CHECK-NEXT: info/ext_codeplay_device_traits.def
112112
// CHECK-NEXT: info/ext_intel_device_traits.def
113113
// CHECK-NEXT: info/ext_oneapi_device_traits.def
114+
// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
114115
// CHECK-NEXT: info/sycl_backend_traits.def
115116
// CHECK-NEXT: platform.hpp
116117
// CHECK-NEXT: detail/string_view.hpp

0 commit comments

Comments
 (0)
Please sign in to comment.