Skip to content

Commit cdf4060

Browse files
committed
Merge remote-tracking branch 'origin/sycl' into llvmspirv_pulldown
2 parents 1631cf7 + 6bcb265 commit cdf4060

File tree

21 files changed

+500
-122
lines changed

21 files changed

+500
-122
lines changed

devops/containers/ubuntu2404_base.Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ ENV DEBIAN_FRONTEND=noninteractive
44

55
USER root
66

7+
# Configure LLVM nightly repo
8+
RUN apt-get update -qq && apt-get install --no-install-recommends -yqq curl ca-certificates
9+
RUN curl -sSL https://apt.llvm.org/llvm-snapshot.gpg.key -o /etc/apt/trusted.gpg.d/apt.llvm.org.asc
10+
RUN echo 'deb http://apt.llvm.org/noble/ llvm-toolchain-noble main' > /etc/apt/sources.list.d/llvm.list
11+
712
# Install SYCL prerequisites
813
COPY scripts/install_build_tools.sh /install.sh
914
RUN /install.sh

sycl/doc/extensions/proposed/sycl_ext_intel_event_mode.asciidoc renamed to sycl/doc/extensions/experimental/sycl_ext_intel_event_mode.asciidoc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@ This extension also depends on the following other SYCL extensions:
5252

5353
== Status
5454

55-
This is a proposed extension specification, intended to gather community
56-
feedback. Interfaces defined in this specification may not be implemented yet
57-
or may be in a preliminary state. The specification itself may also change in
58-
incompatible ways before it is finalized. *Shipping software products should
59-
not rely on APIs defined in this specification.*
55+
This is an experimental extension specification, intended to provide early
56+
access to features and gather community feedback. Interfaces defined in this
57+
specification are implemented in {dpcpp}, but they are not finalized and may
58+
change incompatibly in future versions of {dpcpp} without prior notice.
59+
*Shipping software products should not rely on APIs defined in this
60+
specification.*
6061

6162

6263
== Overview

sycl/source/detail/kernel_name_based_cache_t.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ struct FastKernelSubcacheT {
7474
struct KernelNameBasedCacheT {
7575
FastKernelSubcacheT FastKernelSubcache;
7676
std::optional<bool> UsesAssert;
77+
// Implicit local argument position is represented by an optional int, this
78+
// uses another optional on top of that to represent lazy initialization of
79+
// the cached value.
80+
std::optional<std::optional<int>> ImplicitLocalArgPos;
7781
};
7882

7983
} // namespace detail

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1841,12 +1841,24 @@ void ProgramManager::cacheKernelImplicitLocalArg(RTDeviceBinaryImage &Img) {
18411841
}
18421842
}
18431843

1844-
std::optional<int>
1845-
ProgramManager::kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const {
1846-
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
1847-
if (it != m_KernelImplicitLocalArgPos.end())
1848-
return it->second;
1849-
return {};
1844+
std::optional<int> ProgramManager::kernelImplicitLocalArgPos(
1845+
KernelNameStrRefT KernelName,
1846+
KernelNameBasedCacheT *KernelNameBasedCachePtr) const {
1847+
auto getLocalArgPos = [&]() -> std::optional<int> {
1848+
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
1849+
if (it != m_KernelImplicitLocalArgPos.end())
1850+
return it->second;
1851+
return {};
1852+
};
1853+
1854+
if (!KernelNameBasedCachePtr)
1855+
return getLocalArgPos();
1856+
std::optional<std::optional<int>> &ImplicitLocalArgPos =
1857+
KernelNameBasedCachePtr->ImplicitLocalArgPos;
1858+
if (!ImplicitLocalArgPos.has_value()) {
1859+
ImplicitLocalArgPos = getLocalArgPos();
1860+
}
1861+
return ImplicitLocalArgPos.value();
18501862
}
18511863

18521864
static bool isBfloat16DeviceLibImage(sycl_device_binary RawImg,

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,8 +373,9 @@ class ProgramManager {
373373

374374
SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; }
375375

376-
std::optional<int>
377-
kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const;
376+
std::optional<int> kernelImplicitLocalArgPos(
377+
KernelNameStrRefT KernelName,
378+
KernelNameBasedCacheT *KernelNameBasedCachePtr) const;
378379

379380
std::set<RTDeviceBinaryImage *>
380381
getRawDeviceImages(const std::vector<kernel_id> &KernelIDs);

sycl/source/detail/scheduler/commands.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2389,8 +2389,9 @@ static ur_result_t SetKernelParamsAndLaunch(
23892389
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
23902390
bool IsCooperative, bool KernelUsesClusterLaunch,
23912391
uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage,
2392-
KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr,
2393-
int KernelNumArgs = 0,
2392+
KernelNameStrRefT KernelName,
2393+
KernelNameBasedCacheT *KernelNameBasedCachePtr,
2394+
void *KernelFuncPtr = nullptr, int KernelNumArgs = 0,
23942395
detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr,
23952396
bool KernelHasSpecialCaptures = true) {
23962397
const AdapterPtr &Adapter = Queue.getAdapter();
@@ -2437,7 +2438,8 @@ static ur_result_t SetKernelParamsAndLaunch(
24372438
}
24382439

24392440
std::optional<int> ImplicitLocalArg =
2440-
ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName);
2441+
ProgramManager::getInstance().kernelImplicitLocalArgPos(
2442+
KernelName, KernelNameBasedCachePtr);
24412443
// Set the implicit local memory buffer to support
24422444
// get_work_group_scratch_memory. This is for backend not supporting
24432445
// CUDA-style local memory setting. Note that we may have -1 as a position,
@@ -2752,8 +2754,8 @@ void enqueueImpKernel(
27522754
*Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
27532755
OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
27542756
KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize,
2755-
BinImage, KernelName, KernelFuncPtr, KernelNumArgs,
2756-
KernelParamDescGetter, KernelHasSpecialCaptures);
2757+
BinImage, KernelName, KernelNameBasedCachePtr, KernelFuncPtr,
2758+
KernelNumArgs, KernelParamDescGetter, KernelHasSpecialCaptures);
27572759
}
27582760
if (UR_RESULT_SUCCESS != Error) {
27592761
// If we have got non-success error code, let's analyze it to emit nice

sycl/source/feature_test.hpp.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ inline namespace _V1 {
120120
// In progress yet
121121
#define SYCL_EXT_ONEAPI_ATOMIC16 0
122122
#define SYCL_KHR_DEFAULT_CONTEXT 1
123+
#define SYCL_EXT_INTEL_EVENT_MODE 1
123124

124125
#ifndef __has_include
125126
#define __has_include(x) 0
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/detail/core.hpp>
5+
#include <sycl/usm.hpp>
6+
7+
#include <iostream>
8+
#include <vector>
9+
10+
std::vector<sycl::event> submit_dependencies(sycl::queue q1, sycl::queue q2,
11+
int *mem1, int *mem2) {
12+
int delay_ops = 1024 * 1024;
13+
auto delay = [=] {
14+
volatile int value = delay_ops;
15+
while (--value)
16+
;
17+
};
18+
19+
auto ev1 =
20+
q1.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
21+
delay();
22+
mem1[u.get_id()] = 1;
23+
});
24+
auto ev2 =
25+
q2.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
26+
delay();
27+
mem2[u.get_id()] = 2;
28+
});
29+
30+
return {ev1, ev2};
31+
}
32+
33+
void test_host_task() {
34+
sycl::context c1{};
35+
sycl::context c2{};
36+
37+
sycl::queue q1(c1, sycl::default_selector_v);
38+
sycl::queue q2(c2, sycl::default_selector_v);
39+
40+
auto mem1 = sycl::malloc_host<int>(1024, q1);
41+
auto mem2 = sycl::malloc_host<int>(1024, q2);
42+
43+
auto events = submit_dependencies(q1, q2, mem1, mem2);
44+
45+
q2.submit([&](sycl::handler &cgh) {
46+
cgh.depends_on(events[0]);
47+
cgh.depends_on(events[1]);
48+
cgh.host_task([=]() {
49+
for (int i = 0; i < 1024; i++) {
50+
assert(mem1[i] == 1);
51+
assert(mem2[i] == 2);
52+
}
53+
});
54+
});
55+
56+
q2.wait();
57+
58+
sycl::free(mem1, c1);
59+
sycl::free(mem2, c2);
60+
}
61+
62+
void test_kernel() {
63+
sycl::context c1{};
64+
sycl::context c2{};
65+
66+
sycl::queue q1(c1, sycl::default_selector_v);
67+
sycl::queue q2(c2, sycl::default_selector_v);
68+
69+
auto mem1 = sycl::malloc_device<int>(1024, q1);
70+
auto mem2 = sycl::malloc_device<int>(1024, q2);
71+
72+
auto events = submit_dependencies(q1, q2, mem1, mem2);
73+
74+
q1.submit([&](sycl::handler &cgh) {
75+
cgh.depends_on(events[0]);
76+
cgh.depends_on(events[1]);
77+
cgh.parallel_for(sycl::range<1>(1024),
78+
[=](auto item) { assert(mem1[item.get_id()] == 1); });
79+
});
80+
81+
q2.submit([&](sycl::handler &cgh) {
82+
cgh.depends_on(events[0]);
83+
cgh.depends_on(events[1]);
84+
cgh.parallel_for(sycl::range<1>(1024),
85+
[=](auto item) { assert(mem2[item.get_id()] == 2); });
86+
});
87+
88+
q1.wait();
89+
q2.wait();
90+
91+
sycl::free(mem1, c1);
92+
sycl::free(mem2, c2);
93+
}
94+
95+
int main() {
96+
test_host_task();
97+
test_kernel();
98+
99+
return 0;
100+
}

unified-runtime/source/adapters/level_zero/v2/memory.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ struct ur_mem_image_t : ur_object {
247247
ZeStruct<ze_image_desc_t> zeImageDesc;
248248
};
249249

250-
struct ur_mem_handle_t_ {
250+
struct ur_mem_handle_t_ : ur::handle_base<ur::level_zero::ddi_getter> {
251251
template <typename T, typename... Args>
252252
static ur_mem_handle_t_ *create(Args &&...args) {
253253
return new ur_mem_handle_t_(std::in_place_type<T>,
@@ -293,7 +293,8 @@ struct ur_mem_handle_t_ {
293293
private:
294294
template <typename T, typename... Args>
295295
ur_mem_handle_t_(std::in_place_type_t<T>, Args &&...args)
296-
: mem(std::in_place_type<T>, std::forward<Args>(args)...) {}
296+
: ur::handle_base<ur::level_zero::ddi_getter>(),
297+
mem(std::in_place_type<T>, std::forward<Args>(args)...) {}
297298

298299
std::variant<ur_usm_handle_t, ur_integrated_buffer_handle_t,
299300
ur_discrete_buffer_handle_t, ur_shared_buffer_handle_t,

unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1026,9 +1026,14 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp(
10261026
ur_event_handle_t executionEvent =
10271027
hCommandBuffer->getExecutionEventUnlocked();
10281028

1029+
if (executionEvent != nullptr) {
1030+
ZE2UR_CALL(zeEventHostSynchronize,
1031+
(executionEvent->getZeEvent(), UINT64_MAX));
1032+
}
1033+
10291034
UR_CALL(enqueueGenericCommandListsExp(
10301035
1, &commandBufferCommandList, phEvent, numEventsInWaitList,
1031-
phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent));
1036+
phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, nullptr));
10321037
UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent));
10331038
if (internalEvent != nullptr) {
10341039
internalEvent->release();

unified-runtime/source/adapters/offload/enqueue.cpp

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,43 @@
2020
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
2121
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
2222
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
23-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
24-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
23+
const size_t *pLocalWorkSize, uint32_t, const ur_kernel_launch_property_t *,
24+
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
25+
ur_event_handle_t *phEvent) {
2526
// Ignore wait list for now
2627
(void)numEventsInWaitList;
2728
(void)phEventWaitList;
2829
//
2930

3031
(void)pGlobalWorkOffset;
31-
(void)pLocalWorkSize;
3232

33-
if (workDim == 1) {
34-
std::cerr
35-
<< "UR Offload adapter only supports 1d kernel launches at the moment";
36-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
33+
size_t GlobalSize[3] = {1, 1, 1};
34+
for (uint32_t I = 0; I < workDim; I++) {
35+
GlobalSize[I] = pGlobalWorkSize[I];
36+
}
37+
38+
// TODO: We default to 1, 1, 1 here. In future if pLocalWorkSize is not
39+
// specified, we should pick the "best" one
40+
size_t GroupSize[3] = {1, 1, 1};
41+
if (pLocalWorkSize) {
42+
for (uint32_t I = 0; I < workDim; I++) {
43+
GroupSize[I] = pLocalWorkSize[I];
44+
}
45+
}
46+
47+
if (GroupSize[0] > GlobalSize[0] || GroupSize[1] > GlobalSize[1] ||
48+
GroupSize[2] > GlobalSize[2]) {
49+
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
3750
}
3851

3952
ol_kernel_launch_size_args_t LaunchArgs;
4053
LaunchArgs.Dimensions = workDim;
41-
LaunchArgs.NumGroupsX = pGlobalWorkSize[0];
42-
LaunchArgs.NumGroupsY = 1;
43-
LaunchArgs.NumGroupsZ = 1;
44-
LaunchArgs.GroupSizeX = 1;
45-
LaunchArgs.GroupSizeY = 1;
46-
LaunchArgs.GroupSizeZ = 1;
54+
LaunchArgs.NumGroupsX = GlobalSize[0] / GroupSize[0];
55+
LaunchArgs.NumGroupsY = GlobalSize[1] / GroupSize[1];
56+
LaunchArgs.NumGroupsZ = GlobalSize[2] / GroupSize[2];
57+
LaunchArgs.GroupSizeX = GroupSize[0];
58+
LaunchArgs.GroupSizeY = GroupSize[1];
59+
LaunchArgs.GroupSizeZ = GroupSize[2];
4760
LaunchArgs.DynSharedMemory = 0;
4861

4962
ol_event_handle_t EventOut;

unified-runtime/source/adapters/offload/ur_interface_loader.cpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -381,25 +381,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
381381
return result;
382382
}
383383

384-
pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
385384
pDdiTable->pfnTimestampRecordingExp = nullptr;
386385
pDdiTable->pfnNativeCommandExp = nullptr;
387386

388387
return UR_RESULT_SUCCESS;
389388
}
390389

391-
UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
392-
ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) {
393-
auto result = validateProcInputs(version, pDdiTable);
394-
if (UR_RESULT_SUCCESS != result) {
395-
return result;
396-
}
397-
398-
pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
399-
400-
return UR_RESULT_SUCCESS;
401-
}
402-
403390
UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable(
404391
ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) {
405392
auto result = validateProcInputs(version, pDdiTable);
@@ -424,7 +411,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version,
424411
urGetEnqueueExpProcAddrTable(version, &pDdiTable->EnqueueExp);
425412
urGetEventProcAddrTable(version, &pDdiTable->Event);
426413
urGetKernelProcAddrTable(version, &pDdiTable->Kernel);
427-
urGetKernelExpProcAddrTable(version, &pDdiTable->KernelExp);
428414
urGetMemProcAddrTable(version, &pDdiTable->Mem);
429415
urGetPhysicalMemProcAddrTable(version, &pDdiTable->PhysicalMem);
430416
urGetPlatformProcAddrTable(version, &pDdiTable->Platform);

unified-runtime/source/adapters/opencl/event.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) {
149149

150150
UR_APIEXPORT ur_result_t UR_APICALL
151151
urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
152-
std::vector<cl_event> CLEvents(numEvents);
152+
ur_context_handle_t hContext = phEventWaitList[0]->Context;
153+
std::vector<cl_event> CLEvents;
154+
CLEvents.reserve(numEvents);
155+
156+
// clWaitForEvents can only be called on events from the same context.
157+
// If the events are from different contexts, we need to wait for each
158+
// set of events separately.
153159
for (uint32_t i = 0; i < numEvents; i++) {
154-
CLEvents[i] = phEventWaitList[i]->CLEvent;
160+
if (phEventWaitList[i]->Context != hContext) {
161+
CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data()));
162+
CLEvents.clear();
163+
}
164+
165+
CLEvents.push_back(phEventWaitList[i]->CLEvent);
166+
hContext = phEventWaitList[i]->Context;
167+
}
168+
if (CLEvents.size()) {
169+
CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data()));
155170
}
156-
cl_int RetErr = clWaitForEvents(numEvents, CLEvents.data());
157-
CL_RETURN_ON_FAILURE(RetErr);
158171
return UR_RESULT_SUCCESS;
159172
}
160173

0 commit comments

Comments
 (0)