Skip to content

Handler-less kernel submit API #19294

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 27 commits into
base: sycl
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3223842
[SYCL] Handler-less kernel submit API
slawekptak Jul 3, 2025
fde19ca
Fix formatting
slawekptak Jul 3, 2025
13424de
Fix formatting
slawekptak Jul 4, 2025
fbc789d
Change the ExtendedSubmissionInfo to KernelRuntimeInfo,
slawekptak Jul 7, 2025
591b3ec
Added copy/move constructor and assignment operator
slawekptak Jul 8, 2025
d235b7c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 8, 2025
6641601
Add a no event submit and no handler compile flag
slawekptak Jul 11, 2025
0f41d5a
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 14, 2025
a6e711e
Added a new configure option to build no handler submit path, changed
slawekptak Jul 14, 2025
9c8040e
Host task dependency test
slawekptak Jul 17, 2025
31cbdb9
Add a check for special captures
slawekptak Jul 18, 2025
c5cd091
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 11, 2025
998d592
Switch to the common kernel wrappers, fix the KRInfo function call
slawekptak Aug 11, 2025
4000c07
Enable no handler in the preview lib build, add no handler unit
slawekptak Aug 12, 2025
f8e9cd6
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 12, 2025
01af8bb
Unused argument fix and IsTopCodeLoc assignment
slawekptak Aug 12, 2025
4469e59
Implemented the barrier and un-enqueued commands synchronization
slawekptak Aug 13, 2025
ac1a5cf
Fix formatting
slawekptak Aug 13, 2025
5865f3a
Fixed #ifdef, added comment to a new function.
slawekptak Aug 13, 2025
072803c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 19, 2025
27b3110
Address review comments
slawekptak Aug 20, 2025
9041e94
Updated Linux symbols
slawekptak Aug 21, 2025
ac2c5bb
Addressed more review comments
slawekptak Aug 21, 2025
8e155fb
Fix formatting
slawekptak Aug 21, 2025
502f637
Fix formatting, remove unused properties argument
slawekptak Aug 21, 2025
d708c93
Fix ProcessKernelRuntimeInfo call
slawekptak Aug 21, 2025
e9f6e4e
Fix unit test build and ProcessKernelRuntimeInfo calls
slawekptak Aug 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions sycl/cmake/modules/AddSYCLUnitTest.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Internal function to create SYCL unit tests with code reuse
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh)
# Enable exception handling for these unit tests
set(LLVM_REQUIRES_EH ON)
set(LLVM_REQUIRES_RTTI ON)
Expand Down Expand Up @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# Changing CMAKE_CURRENT_BINARY_DIR should not affect this variable in its
# parent scope.
if (${is_preview})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
endif()

if (${is_no_cgh})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH")
endif()

if ("${link_variant}" MATCHES "SHARED")
Expand Down Expand Up @@ -65,6 +69,18 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
set(sycl_cache_suffix "_preview")
endif()

if (${is_no_cgh})
set(sycl_cache_suffix "_no_cgh")
endif()

if (${is_no_cgh})
target_compile_definitions(
${test_dirname}
PRIVATE
__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
)
endif()

if (SYCL_ENABLE_XPTI_TRACING)
target_compile_definitions(${test_dirname}
PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY)
Expand Down Expand Up @@ -149,7 +165,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
-Wno-inconsistent-missing-override
)
endif()

target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING)
endfunction()

Expand All @@ -159,6 +174,7 @@ endfunction()
# the SYCL preview features enabled.
# Produces three binaries, named `basename(test_name_prefix_non_preview)`, `basename(test_name_prefix_no_cgh)` and `basename(test_name_prefix_preview)`
macro(add_sycl_unittest test_name_prefix link_variant)
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN})
endmacro()
65 changes: 65 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,23 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props,
return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT>(
Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc);
}

template <typename KernelName, typename PropertiesT, typename KernelType,
int Dims>
void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range<Dims> Range,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having lots of layers of tiny template helpers is bad for compile time, why can't it be inlined?

Ideally, most interfaces that accept the kernel type as a template parameter must process compile-time properties immediately and only call interfaces that accept a type-erased kernel.

Additionally, fewer tiny layers make the code much easier to read.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code follows the convention which is there for handler based submissions. We should probably refactor the entire file in a separate PR (after this PR is merged).

const KernelType &KernelFunc,
const sycl::detail::code_location &CodeLoc) {
Q.submit_direct_without_event<KernelName, PropertiesT, KernelType, Dims>(
Props, Range, KernelFunc, CodeLoc);
}
// Event-returning helper for the handler-less nd_range kernel submission.
// Forwards the properties, nd_range, kernel functor and code location
// unchanged to the queue's submit_direct_with_event member template and
// returns the event it produces. All template arguments are passed explicitly,
// so no deduction happens at this layer.
template <typename KernelName, typename PropertiesT, typename KernelType,
int Dims>
event submit_direct_with_event_impl(
const queue &Q, PropertiesT Props, nd_range<Dims> Range,
const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) {
return Q.submit_direct_with_event<KernelName, PropertiesT, KernelType, Dims>(
Props, Range, KernelFunc, CodeLoc);
}
} // namespace detail

template <typename CommandGroupFunc, typename PropertiesT>
Expand All @@ -128,6 +145,17 @@ void submit(const queue &Q, CommandGroupFunc &&CGF,
submit(Q, empty_properties_t{}, std::forward<CommandGroupFunc>(CGF), CodeLoc);
}

template <typename KernelName = sycl::detail::auto_name, typename PropertiesT,
typename KernelType, int Dims>
void submit(const queue &Q, PropertiesT Props, nd_range<Dims> Range,
const KernelType &KernelFunc,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not rvalue?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems the convention in this file is to pass KernelFunc as a const lvalue reference. It might make sense to change it everywhere in a separate PR, for consistency.

const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
sycl::ext::oneapi::experimental::detail::submit_direct_impl<
KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc,
CodeLoc);
}

template <typename CommandGroupFunc, typename PropertiesT>
event submit_with_event(const queue &Q, PropertiesT Props,
CommandGroupFunc &&CGF,
Expand All @@ -145,6 +173,17 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF,
std::forward<CommandGroupFunc>(CGF), CodeLoc);
}

template <typename KernelName = sycl::detail::auto_name, typename PropertiesT,
typename KernelType, int Dims>
event submit_with_event(const queue &Q, PropertiesT Props, nd_range<Dims> Range,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find where this function is used. Could you please clarify?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of the API to be called by the app, when an event is needed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that's for a part of the code that isn't implemented yet. Thanks!

const KernelType &KernelFunc,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl<
KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc,
CodeLoc);
}

template <typename KernelName = sycl::detail::auto_name, typename KernelType>
void single_task(handler &CGH, const KernelType &KernelObj) {
CGH.single_task<KernelName>(KernelObj);
Expand Down Expand Up @@ -261,10 +300,21 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
ReductionsT &&...Reductions) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
if constexpr (sizeof...(ReductionsT) == 0) {
submit<KernelName>(std::move(Q), empty_properties_t{}, Range, KernelObj);
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}
#else
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#endif
}

template <typename KernelName = sycl::detail::auto_name, int Dimensions,
Expand All @@ -285,10 +335,25 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename Properties, typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
const KernelType &KernelObj, ReductionsT &&...Reductions) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
if constexpr (sizeof...(ReductionsT) == 0) {
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
Properties>
ConfigAccess(Config);
submit<KernelName>(std::move(Q), ConfigAccess.getProperties(),
ConfigAccess.getRange(), KernelObj);
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
}
#else
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#endif
}

template <int Dimensions, typename... ArgsT>
Expand Down
15 changes: 15 additions & 0 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
submit(q, ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
/// Launches kernel \p k on queue \p q over the 2-dimensional
/// `nd_range<2>(r, size)`.
///
/// \param q       Queue the kernel is submitted to.
/// \param r       First argument of the nd_range constructor.
/// \param size    Second argument of the nd_range constructor.
/// \param k       Kernel functor to execute.
/// \param codeLoc Code location of the caller; forwarded to the submission on
///                both code paths so that it is attributed to user code.
template <typename KernelType>
void launch_grouped(const queue &q, range<2> r, range<2> size,
                    const KernelType &k,
                    const sycl::detail::code_location &codeLoc =
                        sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
  // Handler-less path. codeLoc must be passed explicitly: otherwise the
  // submit() default argument captures a location inside this header and the
  // codeLoc parameter is left unused.
  submit(q, ext::oneapi::experimental::empty_properties_t{},
         nd_range<2>(r, size), k, codeLoc);
#else
  submit(
      q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
      codeLoc);
#endif
}
/// Launches kernel \p k on queue \p q over the 3-dimensional
/// `nd_range<3>(r, size)`.
///
/// \param q       Queue the kernel is submitted to.
/// \param r       First argument of the nd_range constructor.
/// \param size    Second argument of the nd_range constructor.
/// \param k       Kernel functor to execute.
/// \param codeLoc Code location of the caller; forwarded to the submission on
///                both code paths so that it is attributed to user code.
template <typename KernelType>
void launch_grouped(const queue &q, range<3> r, range<3> size,
                    const KernelType &k,
                    const sycl::detail::code_location &codeLoc =
                        sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
  // Handler-less path. codeLoc must be passed explicitly: otherwise the
  // submit() default argument captures a location inside this header and the
  // codeLoc parameter is left unused.
  submit(q, ext::oneapi::experimental::empty_properties_t{},
         nd_range<3>(r, size), k, codeLoc);
#else
  submit(
      q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
      codeLoc);
#endif
}

template <typename... Args>
Expand Down
Loading
Loading