Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cc71f38
[SYCL] Postpone creation of HostKernel copy
Alexandr-Konovalov Sep 29, 2025
c917410
Fix code formatting.
Alexandr-Konovalov Sep 29, 2025
941a5ff
Fix code formatting.
Alexandr-Konovalov Sep 29, 2025
6258fed
Make HostKernelRef::InstantiateKernelOnHost() empty.
Alexandr-Konovalov Sep 30, 2025
5e89d2f
Fix code formatting.
Alexandr-Konovalov Sep 30, 2025
a90a0ea
Enable move semantic for kernels.
Alexandr-Konovalov Oct 1, 2025
f65d5ba
Fix code formatting.
Alexandr-Konovalov Oct 1, 2025
91deab6
Return unique_ptr from takeOrCopyOwnership().
Alexandr-Konovalov Oct 1, 2025
cf65f74
Add unit test.
Alexandr-Konovalov Oct 1, 2025
4c85aa5
Fix code formatting.
Alexandr-Konovalov Oct 1, 2025
63b7572
Update sycl/include/sycl/detail/cg_types.hpp
Alexandr-Konovalov Oct 1, 2025
855d6a2
Remove redundant cast.
Alexandr-Konovalov Oct 1, 2025
cb688cd
Update sycl/include/sycl/detail/cg_types.hpp
Alexandr-Konovalov Oct 1, 2025
b81d48b
Use C++17-style metafunctions.
Alexandr-Konovalov Oct 1, 2025
a64c17c
Addressing code review.
Alexandr-Konovalov Oct 2, 2025
8de865a
Fix code formatting.
Alexandr-Konovalov Oct 2, 2025
79d19a9
Delete assignment operator and add layout test for HostKernelRefBase.
Alexandr-Konovalov Oct 2, 2025
a091ac4
Explicitely delete move ctor for HostKernelRef.
Alexandr-Konovalov Oct 2, 2025
5de3ae6
Update sycl_symbols_(linux|windows).dump
Alexandr-Konovalov Oct 2, 2025
06e3a92
Explicitely delete copy ctor from HostKernelRefBase.
Alexandr-Konovalov Oct 2, 2025
6d38d49
Merge branch 'sycl' into Alexandr-Konovalov/vptr_HostKernelRefBase
Alexandr-Konovalov Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion sycl/include/sycl/detail/cg_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,66 @@ class HostKernel : public HostKernelBase {
#endif
};

// the class keeps reference to a lambda allocated externally on stack
class HostKernelRefBase : public HostKernelBase {
public:
HostKernelRefBase() = default;
HostKernelRefBase(const HostKernelRefBase &) = delete;
HostKernelRefBase &operator=(const HostKernelRefBase &) = delete;

virtual std::unique_ptr<HostKernelBase> takeOrCopyOwnership() const = 0;
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// The kernels that are passed via HostKernelRefBase are instantiated along
// ctor call with GetInstantiateKernelOnHostPtr().
void InstantiateKernelOnHost() override {}
#endif
};

// Primary template for movable objects.
template <class KernelType, class KernelTypeUniversalRef, class KernelArgType,
int Dims>
class HostKernelRef : public HostKernelRefBase {
KernelType &&MKernel;

public:
HostKernelRef(KernelType &&Kernel) : MKernel(std::move(Kernel)) {}
HostKernelRef(const KernelType &Kernel) = delete;

virtual char *getPtr() override { return reinterpret_cast<char *>(&MKernel); }
virtual std::unique_ptr<HostKernelBase> takeOrCopyOwnership() const override {
std::unique_ptr<HostKernelBase> Kernel;
Kernel.reset(
new HostKernel<KernelType, KernelArgType, Dims>(std::move(MKernel)));
return Kernel;
}

~HostKernelRef() noexcept override = default;
};

// Specialization for copyable objects.
template <class KernelType, class KernelTypeUniversalRef, class KernelArgType,
int Dims>
class HostKernelRef<KernelType, KernelTypeUniversalRef &, KernelArgType, Dims>
: public HostKernelRefBase {
const KernelType &MKernel;

public:
HostKernelRef(const KernelType &Kernel) : MKernel(Kernel) {}

virtual char *getPtr() override {
return const_cast<char *>(reinterpret_cast<const char *>(&MKernel));
}
virtual std::unique_ptr<HostKernelBase> takeOrCopyOwnership() const override {
std::unique_ptr<HostKernelBase> Kernel;
Kernel.reset(new HostKernel<KernelType, KernelArgType, Dims>(MKernel));
return Kernel;
}

~HostKernelRef() noexcept override = default;
};

// This function is needed for host-side compilation to keep kernels
// instantitated. This is important for debuggers to be able to associate
// instantiated. This is important for debuggers to be able to associate
// kernel code instructions with source code lines.
template <class KernelType, class KernelArgType, int Dims>
constexpr void *GetInstantiateKernelOnHostPtr() {
Expand Down
38 changes: 21 additions & 17 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,44 +149,48 @@ void launch_grouped(handler &h, range<3> r, range<3> size,
}

template <typename KernelType>
void launch_grouped(const queue &q, range<1> r, range<1> size,
const KernelType &k,
constexpr bool enable_kernel_function_overload =
!std::is_same_v<typename std::decay_t<KernelType>, sycl::kernel>;

template <typename KernelType, typename = typename std::enable_if_t<
enable_kernel_function_overload<KernelType>>>
void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), k);
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<2> r, range<2> size,
const KernelType &k,
template <typename KernelType, typename = typename std::enable_if_t<
enable_kernel_function_overload<KernelType>>>
void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), k);
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<3> r, range<3> size,
const KernelType &k,
template <typename KernelType, typename = typename std::enable_if_t<
enable_kernel_function_overload<KernelType>>>
void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
detail::submit_kernel_direct(q,
ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), k);
detail::submit_kernel_direct(
q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size),
std::forward<KernelType>(k));
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand Down
29 changes: 20 additions & 9 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,14 @@ auto get_native(const SyclObjectT &Obj)
template <int Dims>
event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <int Dims>
void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

Expand Down Expand Up @@ -157,10 +157,10 @@ class __SYCL_EXPORT SubmissionInfo {
} // namespace v1

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelType, int Dims>
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
auto submit_kernel_direct(
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
const KernelType &KernelFunc,
KernelTypeUniversalRef &&KernelFunc,
const detail::code_location &CodeLoc = detail::code_location::current()) {
// TODO Properties not supported yet
(void)Props;
Expand All @@ -170,6 +170,9 @@ auto submit_kernel_direct(
"Setting properties not supported yet for no-CGH kernel submit.");
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);

using KernelType =
std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
using LambdaArgType =
Expand All @@ -180,15 +183,23 @@ auto submit_kernel_direct(
"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;

std::shared_ptr<detail::HostKernelBase> HostKernel = std::make_shared<
detail::HostKernel<KernelType, TransformedArgType, Dims>>(KernelFunc);
detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
TransformedArgType, PropertiesT>::wrap(KernelFunc);

HostKernelRef<KernelType, KernelTypeUniversalRef, TransformedArgType, Dims>
HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));

// Instantiating the kernel on the host improves debugging.
// Passing this pointer to another translation unit prevents optimization.
#ifndef NDEBUG
// TODO: call library to prevent dropping call due to optimization
(void)
detail::GetInstantiateKernelOnHostPtr<KernelType, LambdaArgType, Dims>();
#endif

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();

detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
TransformedArgType, PropertiesT>::wrap(KernelFunc);

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr,
Expand Down
10 changes: 6 additions & 4 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,15 +421,17 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
}

detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
const NDRDescT &NDRDesc,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {

KernelData KData;

std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
HostKernel.takeOrCopyOwnership();

KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
KData.setKernelFunc(HostKernel->getPtr());
KData.setKernelFunc(HostKernelPtr->getPtr());
KData.setNDRDesc(NDRDesc);

auto SubmitKernelFunc =
Expand All @@ -441,7 +443,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
KData.extractArgsAndReqsFromLambda();

CommandGroup.reset(new detail::CGExecKernel(
KData.getNDRDesc(), HostKernel,
KData.getNDRDesc(), std::move(HostKernelPtr),
nullptr, // Kernel
nullptr, // KernelBundle
std::move(CGData), std::move(KData).getArgs(),
Expand Down
9 changes: 3 additions & 6 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,7 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {

template <int Dims>
event submit_kernel_direct_with_event(
const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
const nd_range<Dims> &Range, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
detail::EventImplPtr EventImpl =
Expand All @@ -373,8 +372,7 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {

template <int Dims>
void submit_kernel_direct_without_event(
const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
const nd_range<Dims> &Range, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo,
Expand Down Expand Up @@ -905,8 +903,7 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
///
/// \return a SYCL event representing submitted command group or nullptr.
detail::EventImplPtr submit_kernel_direct_impl(
const NDRDescT &NDRDesc,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

Expand Down
16 changes: 8 additions & 8 deletions sycl/source/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ const property_list &queue::getPropList() const { return impl->getPropList(); }
template <int Dims>
event submit_kernel_direct_with_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
return getSyclObjImpl(Queue)->submit_kernel_direct_with_event(
Expand All @@ -483,26 +483,26 @@ event submit_kernel_direct_with_event_impl(

template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>(
const queue &Queue, const nd_range<1> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>(
const queue &Queue, const nd_range<2> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>(
const queue &Queue, const nd_range<3> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template <int Dims>
void submit_kernel_direct_without_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
getSyclObjImpl(Queue)->submit_kernel_direct_without_event(
Expand All @@ -511,19 +511,19 @@ void submit_kernel_direct_without_event_impl(

template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>(
const queue &Queue, const nd_range<1> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>(
const queue &Queue, const nd_range<2> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>(
const queue &Queue, const nd_range<3> &Range,
std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

Expand Down
15 changes: 15 additions & 0 deletions sycl/test/abi/layout_host_kernel_ref.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
// REQUIRES: linux
// UNSUPPORTED: libcxx

// clang-format off

#include <sycl/detail/cg_types.hpp>

void foo(sycl::detail::HostKernelRefBase *) {}

// CHECK: 0 | class sycl::detail::HostKernelRefBase
// CHECK-NEXT: 0 | class sycl::detail::HostKernelBase (primary base)
// CHECK-NEXT: 0 | (HostKernelBase vtable pointer)
// CHECK-NEXT: | [sizeof=8, dsize=8, align=8,
// CHECK-NEXT: | nvsize=8, nvalign=8]
12 changes: 6 additions & 6 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -2985,12 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd
_ZN4sycl3_V121__isgreaterequal_implEff
_ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE
_ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSB_16DeviceKernelInfoERKNSB_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERSt10shared_ptrINS0_6detail14HostKernelBaseEEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb
_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb
_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb
_ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv
_ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE
_ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv
Expand Down
Loading