Skip to content

Commit 88472f3

Browse files
EwanCkbenzie
andauthored
[SYCL][Graph] Fix CUDA/HIP local mem argument update bug (#16025)
Tests UR PR oneapi-src/unified-runtime#2298 with additional SYCL-Graph local memory argument E2E tests. PR also sets the `pnext` and `snext` members of `ur_exp_command_buffer_update_kernel_launch_desc_t ` which were missing when calling into UR. --------- Co-authored-by: Kenneth Benzie (Benie) <[email protected]>
1 parent 130a901 commit 88472f3

23 files changed

+656
-27
lines changed

sycl/cmake/modules/FetchUnifiedRuntime.cmake

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
116116
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
117117
endfunction()
118118

119-
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
119+
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
120120
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
121121

122122
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
+7-7
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
# commit 7a38cc3e21d83940622d63b0b902bf68f9539f6f
2-
# Merge: 2dac0daaf327 252b3822f0e8
3-
# Author: Martin Grant <martin.morrisongrant@codeplay.com>
4-
# Date: Mon Dec 2 15:41:23 2024 +0000
5-
# Merge pull request #2395 from pbalcer/fix-event-pooling
6-
# fix event caching
7-
set(UNIFIED_RUNTIME_TAG 7a38cc3e21d83940622d63b0b902bf68f9539f6f)
1+
# commit 2bea25d7d5404ce82f36caf91c359f65b25187d7
2+
# Merge: 0b5d8f9e e578228a
3+
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
4+
# Date: Tue Dec 3 12:41:16 2024 +0000
5+
# Merge pull request #2298 from Bensuo/ewan/cuda_update_local_size
6+
# Improve CUDA/HIP local argument handling
7+
set(UNIFIED_RUNTIME_TAG 2bea25d7d5404ce82f36caf91c359f65b25187d7)

sycl/source/detail/graph_impl.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -1512,7 +1512,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
15121512
PtrDescs.reserve(MaskedArgs.size());
15131513
ValueDescs.reserve(MaskedArgs.size());
15141514

1515-
ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc;
1515+
ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{};
1516+
UpdateDesc.stype =
1517+
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
1518+
UpdateDesc.pNext = nullptr;
15161519

15171520
// Collect arg descriptors and fill kernel launch descriptor
15181521
using sycl::detail::kernel_param_kind_t;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_EXPLICIT
9+
10+
#include "../Inputs/local_accessor_multiple_accessors.cpp"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
#define GRAPH_E2E_EXPLICIT
9+
10+
#include "../Inputs/local_accessor_multiple_nodes.cpp"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// REQUIRES: ocloc && (opencl || level_zero)
9+
10+
#define GRAPH_E2E_EXPLICIT
11+
12+
#include "../Inputs/opencl_local_acc.cpp"

sycl/test-e2e/Graph/Inputs/local_accessor.cpp

+7-9
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,18 @@ int main() {
1010

1111
const size_t LocalSize = 128;
1212

13-
std::vector<T> DataA(Size), DataB(Size), DataC(Size);
13+
std::vector<T> HostData(Size);
1414

15-
std::iota(DataA.begin(), DataA.end(), 10);
16-
17-
std::vector<T> ReferenceA(DataA);
15+
std::iota(HostData.begin(), HostData.end(), 10);
1816

1917
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
2018

2119
T *PtrA = malloc_device<T>(Size, Queue);
2220

23-
Queue.copy(DataA.data(), PtrA, Size);
21+
Queue.copy(HostData.data(), PtrA, Size);
2422
Queue.wait_and_throw();
2523

26-
auto node = add_node(Graph, Queue, [&](handler &CGH) {
24+
auto Node = add_node(Graph, Queue, [&](handler &CGH) {
2725
local_accessor<T, 1> LocalMem(LocalSize, CGH);
2826

2927
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
@@ -40,14 +38,14 @@ int main() {
4038

4139
Queue.wait_and_throw();
4240

43-
Queue.copy(PtrA, DataA.data(), Size);
41+
Queue.copy(PtrA, HostData.data(), Size);
4442
Queue.wait_and_throw();
4543

4644
free(PtrA, Queue);
4745

4846
for (size_t i = 0; i < Size; i++) {
49-
T Ref = 10 + i + (i * 2);
50-
check_value(i, Ref, ReferenceA[i], "PtrA");
47+
T Ref = 10 + i + (Iterations * (i * 2));
48+
assert(check_value(i, Ref, HostData[i], "PtrA"));
5149
}
5250

5351
return 0;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Tests adding of nodes with more than one local accessor,
2+
// and submitting the graph.
3+
4+
#include "../graph_common.hpp"
5+
6+
int main() {
7+
queue Queue{};
8+
9+
using T = int;
10+
11+
const size_t LocalSize = 128;
12+
13+
std::vector<T> HostData(Size);
14+
15+
std::iota(HostData.begin(), HostData.end(), 10);
16+
17+
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
18+
19+
T *PtrIn = malloc_device<T>(Size, Queue);
20+
T *PtrOut = malloc_device<T>(Size, Queue);
21+
22+
Queue.memset(PtrOut, 0, Size * sizeof(T));
23+
Queue.copy(HostData.data(), PtrIn, Size);
24+
Queue.wait_and_throw();
25+
26+
auto Node = add_node(Graph, Queue, [&](handler &CGH) {
27+
local_accessor<T, 1> LocalMemA(LocalSize, CGH);
28+
local_accessor<T, 1> LocalMemB(LocalSize, CGH);
29+
30+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
31+
auto LocalID = Item.get_local_linear_id();
32+
auto GlobalID = Item.get_global_linear_id();
33+
LocalMemA[LocalID] = GlobalID;
34+
LocalMemB[LocalID] = PtrIn[GlobalID];
35+
PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
36+
});
37+
});
38+
39+
auto GraphExec = Graph.finalize();
40+
41+
for (unsigned n = 0; n < Iterations; n++) {
42+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
43+
}
44+
45+
Queue.wait_and_throw();
46+
47+
Queue.copy(PtrOut, HostData.data(), Size);
48+
Queue.wait_and_throw();
49+
50+
free(PtrIn, Queue);
51+
free(PtrOut, Queue);
52+
53+
for (size_t i = 0; i < Size; i++) {
54+
T Ref = 0;
55+
for (size_t n = 0; n < Iterations; n++) {
56+
Ref += (i * (10 + i));
57+
}
58+
assert(check_value(i, Ref, HostData[i], "PtrOut"));
59+
}
60+
61+
return 0;
62+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Test creating a graph where more than one nodes uses local accessors,
2+
// and submits of the graph.
3+
4+
#include "../graph_common.hpp"
5+
6+
int main() {
7+
queue Queue{};
8+
9+
using T = int;
10+
11+
const size_t LocalSize = 128;
12+
13+
std::vector<T> HostData(Size);
14+
15+
std::iota(HostData.begin(), HostData.end(), 10);
16+
17+
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
18+
19+
T *PtrA = malloc_device<T>(Size, Queue);
20+
21+
Queue.copy(HostData.data(), PtrA, Size);
22+
Queue.wait_and_throw();
23+
24+
auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
25+
local_accessor<T, 1> LocalMem(LocalSize, CGH);
26+
27+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
28+
LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
29+
PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
30+
});
31+
});
32+
33+
auto NodeB = add_node(
34+
Graph, Queue,
35+
[&](handler &CGH) {
36+
local_accessor<T, 1> LocalMem(LocalSize, CGH);
37+
depends_on_helper(CGH, NodeA);
38+
39+
CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
40+
LocalMem[Item.get_local_linear_id()] = 3;
41+
PtrA[Item.get_global_linear_id()] *=
42+
LocalMem[Item.get_local_linear_id()];
43+
});
44+
},
45+
NodeA);
46+
47+
auto GraphExec = Graph.finalize();
48+
49+
for (unsigned n = 0; n < Iterations; n++) {
50+
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
51+
}
52+
53+
Queue.wait_and_throw();
54+
55+
Queue.copy(PtrA, HostData.data(), Size);
56+
Queue.wait_and_throw();
57+
58+
free(PtrA, Queue);
59+
60+
for (size_t i = 0; i < Size; i++) {
61+
T Ref = 10 + i;
62+
63+
for (size_t n = 0; n < Iterations; n++) {
64+
Ref += i * 2;
65+
Ref *= 3;
66+
}
67+
assert(check_value(i, Ref, HostData[i], "PtrA"));
68+
}
69+
70+
return 0;
71+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// Tests using an OpenCL-C defined kernel with multiple local accessors
2+
3+
#include "../graph_common.hpp"
4+
5+
using source_kb = sycl::kernel_bundle<sycl::bundle_state::ext_oneapi_source>;
6+
using exe_kb = sycl::kernel_bundle<sycl::bundle_state::executable>;
7+
8+
auto constexpr LocalAccCLSource = R"===(
9+
kernel void test_la(__global int *out, __local int* local_ptr1,
10+
__local int2* local_ptr2, int n) {
11+
__local int4 local_data[1];
12+
13+
size_t gid = get_global_id(0);
14+
size_t lid = get_local_id(0);
15+
size_t wg_size = get_num_groups(0);
16+
17+
local_ptr1[lid] = lid;
18+
local_ptr2[lid].x = n;
19+
local_ptr2[lid].y = wg_size;
20+
21+
if (lid == 0) {
22+
local_data[lid] = (int4)(0xA, 0xB, 0xC, 0xD);
23+
}
24+
25+
barrier(CLK_LOCAL_MEM_FENCE);
26+
27+
int acc = local_data[0].x + local_data[0].y + local_data[0].z +
28+
local_data[0].w;
29+
out[gid] = (local_ptr1[lid] * local_ptr2[lid].x) +
30+
(local_ptr2[lid].y * acc);
31+
}
32+
)===";
33+
34+
int main() {
35+
queue Queue;
36+
37+
source_kb kbSrc = exp_ext::create_kernel_bundle_from_source(
38+
Queue.get_context(), exp_ext::source_language::opencl, LocalAccCLSource);
39+
exe_kb kbExe1 = exp_ext::build(kbSrc);
40+
sycl::kernel test_kernel = kbExe1.ext_oneapi_get_kernel("test_la");
41+
42+
exp_ext::command_graph Graph{Queue};
43+
44+
int32_t *Ptr = malloc_device<int32_t>(Size, Queue);
45+
46+
int32_t N = 42;
47+
constexpr size_t LocalSize = 256;
48+
auto Node = add_node(Graph, Queue, [&](handler &cgh) {
49+
local_accessor<int32_t, 1> acc_local1(LocalSize, cgh);
50+
local_accessor<sycl::int2, 1> acc_local2(LocalSize, cgh);
51+
52+
cgh.set_arg(0, Ptr);
53+
cgh.set_arg(1, acc_local1);
54+
cgh.set_arg(2, acc_local2);
55+
cgh.set_arg(3, N);
56+
57+
cgh.parallel_for(nd_range<1>(Size, LocalSize), test_kernel);
58+
});
59+
60+
auto ExecGraph = Graph.finalize();
61+
Queue.ext_oneapi_graph(ExecGraph).wait();
62+
63+
std::vector<int32_t> HostData(Size);
64+
Queue.copy(Ptr, HostData.data(), Size).wait();
65+
66+
constexpr int32_t Acc = 0xA + 0xB + 0xC + 0xD;
67+
constexpr int32_t WorkGroups = Size / LocalSize;
68+
constexpr int32_t Tmp = Acc * WorkGroups;
69+
for (size_t i = 0; i < Size; i++) {
70+
int32_t local_id = i % LocalSize;
71+
int32_t Ref = (local_id * N) + Tmp;
72+
assert(HostData[i] == Ref);
73+
}
74+
75+
sycl::free(Ptr, Queue);
76+
77+
return 0;
78+
}

sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp

+7-9
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ int main() {
2828
std::iota(DataA.begin(), DataA.end(), 10);
2929
std::iota(DataB.begin(), DataB.end(), 10);
3030

31-
std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
32-
3331
exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
3432

3533
T *PtrA = malloc_device<T>(Size, Queue);
@@ -59,10 +57,10 @@ int main() {
5957
Queue.wait_and_throw();
6058

6159
for (size_t i = 0; i < Size; i++) {
62-
T RefA = 10 + i + (i * 2) + LocalSize / 2;
60+
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
6361
T RefB = 10 + i;
64-
check_value(i, RefA, ReferenceA[i], "PtrA");
65-
check_value(i, RefB, ReferenceB[i], "PtrB");
62+
assert(check_value(i, RefA, DataA[i], "PtrA"));
63+
assert(check_value(i, RefB, DataB[i], "PtrB"));
6664
}
6765

6866
// Update GraphExecA using whole graph update
@@ -81,10 +79,10 @@ int main() {
8179
Queue.wait_and_throw();
8280

8381
for (size_t i = 0; i < Size; i++) {
84-
T RefA = 10 + i + (i * 2) + LocalSize / 2;
85-
T RefB = 10 + i + (i * 2) + LocalSize;
86-
check_value(i, RefA, ReferenceA[i], "PtrA");
87-
check_value(i, RefB, ReferenceB[i], "PtrB");
82+
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
83+
T RefB = 10 + i + Iterations * ((i * 2) + LocalSize);
84+
assert(check_value(i, RefA, DataA[i], "PtrA"));
85+
assert(check_value(i, RefB, DataB[i], "PtrB"));
8886
}
8987

9088
free(PtrA, Queue);

0 commit comments

Comments
 (0)