[SYCL][Graph] Fix CUDA local mem argument update bug
EwanC committed Nov 11, 2024
1 parent 6d1f4fc commit 731cdc0
Showing 14 changed files with 338 additions and 21 deletions.
2 changes: 1 addition & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git")
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
2 changes: 1 addition & 1 deletion sycl/cmake/modules/UnifiedRuntimeTag.cmake
@@ -4,4 +4,4 @@
# Date: Mon Nov 11 13:35:24 2024 +0000
# Merge pull request #2112 from martygrant/martin/context-cts-spec-gap
# Improvements to align CTS and Spec for Context
set(UNIFIED_RUNTIME_TAG 2eae687a4cf24ba02ee8e9ebb9552c1d392972ee)
set(UNIFIED_RUNTIME_TAG "ewan/cuda_update_local_size")
5 changes: 4 additions & 1 deletion sycl/source/detail/graph_impl.cpp
@@ -1510,7 +1510,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
PtrDescs.reserve(MaskedArgs.size());
ValueDescs.reserve(MaskedArgs.size());

ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc;
ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{};
UpdateDesc.stype =
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
UpdateDesc.pNext = nullptr;

// Collect arg descriptors and fill kernel launch descriptor
using sycl::detail::kernel_param_kind_t;
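Note: the essential part of the change above is the `{}` value-initialization of UpdateDesc. With the previous plain declaration, any members the code never assigns are left indeterminate, and garbage in those fields can be misread by a backend as additional update requests. A minimal standalone sketch of the pattern, using a hypothetical descriptor type rather than the real ur_exp_command_buffer_update_kernel_launch_desc_t layout:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for an update descriptor; the real UR struct has
// different members.
struct update_desc_t {
  uint32_t stype;
  const void *pNext;
  uint32_t numNewValueArgs;  // fields the caller never sets explicitly...
  const void *pNewValueArgs; // ...must not carry indeterminate values
};

int main() {
  // update_desc_t Desc;     // members would be indeterminate here
  update_desc_t Desc{};      // value-initialization zeroes every member
  Desc.stype = 1;            // then fill in only what is actually used
  Desc.pNext = nullptr;

  assert(Desc.numNewValueArgs == 0); // guaranteed only with the {} form
  assert(Desc.pNewValueArgs == nullptr);
  return 0;
}

With the aggregate value-initialized, only stype and pNext need explicit assignments in the code above.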
10 changes: 10 additions & 0 deletions sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp
@@ -0,0 +1,10 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/local_accessor_multiple_accessors.cpp"
10 changes: 10 additions & 0 deletions sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp
@@ -0,0 +1,10 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}

#define GRAPH_E2E_EXPLICIT

#include "../Inputs/local_accessor_multiple_nodes.cpp"
16 changes: 7 additions & 9 deletions sycl/test-e2e/Graph/Inputs/local_accessor.cpp
@@ -10,20 +10,18 @@ int main() {

const size_t LocalSize = 128;

std::vector<T> DataA(Size), DataB(Size), DataC(Size);
std::vector<T> HostData(Size);

std::iota(DataA.begin(), DataA.end(), 10);

std::vector<T> ReferenceA(DataA);
std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(HostData.data(), PtrA, Size);
Queue.wait_and_throw();

auto node = add_node(Graph, Queue, [&](handler &CGH) {
auto Node = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
@@ -40,14 +38,14 @@

Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrA, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrA, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 10 + i + (i * 2);
check_value(i, Ref, ReferenceA[i], "PtrA");
T Ref = 10 + i + (Iterations * (i * 2));
assert(check_value(i, Ref, HostData[i], "PtrA"));
}

return 0;
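The reworked check reads HostData, which is copied back from the device after the submission loop, instead of the old host-side ReferenceA copy of the initial data, and it scales the expected delta by Iterations because the executable graph is submitted Iterations times. Assuming the (collapsed) kernel body adds 2 * GlobalID to PtrA[GlobalID] on each submission, consistent with the reference expression in the diff, the expected value is:

\[ \mathrm{Ref}_i = (10 + i) + \mathrm{Iterations} \cdot 2i \]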
62 changes: 62 additions & 0 deletions sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp
@@ -0,0 +1,62 @@
// Tests adding nodes that use more than one local accessor,
// and submitting the graph.

#include "../graph_common.hpp"

int main() {
queue Queue{};

using T = int;

const size_t LocalSize = 128;

std::vector<T> HostData(Size);

std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrIn = malloc_device<T>(Size, Queue);
T *PtrOut = malloc_device<T>(Size, Queue);

Queue.memset(PtrOut, 0, Size * sizeof(T));
Queue.copy(HostData.data(), PtrIn, Size);
Queue.wait_and_throw();

auto Node = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMemA(LocalSize, CGH);
local_accessor<T, 1> LocalMemB(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
auto LocalID = Item.get_local_linear_id();
auto GlobalID = Item.get_global_linear_id();
LocalMemA[LocalID] = GlobalID;
LocalMemB[LocalID] = PtrIn[GlobalID];
PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
});
});

auto GraphExec = Graph.finalize();

for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.wait_and_throw();

Queue.copy(PtrOut, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrIn, Queue);
free(PtrOut, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 0;
for (size_t n = 0; n < Iterations; n++) {
Ref += (i * (10 + i));
}
assert(check_value(i, Ref, HostData[i], "PtrOut"));
}

return 0;
}
71 changes: 71 additions & 0 deletions sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp
@@ -0,0 +1,71 @@
// Tests creating a graph where more than one node uses local accessors,
// and submitting the graph.

#include "../graph_common.hpp"

int main() {
queue Queue{};

using T = int;

const size_t LocalSize = 128;

std::vector<T> HostData(Size);

std::iota(HostData.begin(), HostData.end(), 10);

exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);

Queue.copy(HostData.data(), PtrA, Size);
Queue.wait_and_throw();

auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
});
});

auto NodeB = add_node(
Graph, Queue,
[&](handler &CGH) {
local_accessor<T, 1> LocalMem(LocalSize, CGH);
depends_on_helper(CGH, NodeA);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
LocalMem[Item.get_local_linear_id()] = 3;
PtrA[Item.get_global_linear_id()] *=
LocalMem[Item.get_local_linear_id()];
});
},
NodeA);

auto GraphExec = Graph.finalize();

for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.wait_and_throw();

Queue.copy(PtrA, HostData.data(), Size);
Queue.wait_and_throw();

free(PtrA, Queue);

for (size_t i = 0; i < Size; i++) {
T Ref = 10 + i;

for (size_t n = 0; n < Iterations; n++) {
Ref += i * 2;
Ref *= 3;
}
assert(check_value(i, Ref, HostData[i], "PtrA"));
}

return 0;
}
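The reference loop above implements the per-submission recurrence \(R_n = 3\,(R_{n-1} + 2i)\) with \(R_0 = 10 + i\): NodeA adds 2i and NodeB then multiplies by 3 on each submission. An equivalent closed form, not used by the test but convenient for spot-checking individual elements:

\[ R_N = 3^N (10 + i) + 3i\,(3^N - 1), \qquad N = \mathrm{Iterations}. \]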
16 changes: 7 additions & 9 deletions sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp
@@ -28,8 +28,6 @@ int main() {
std::iota(DataA.begin(), DataA.end(), 10);
std::iota(DataB.begin(), DataB.end(), 10);

std::vector<T> ReferenceA(DataA), ReferenceB(DataB);

exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
@@ -59,10 +57,10 @@
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T RefA = 10 + i + (i * 2) + LocalSize / 2;
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
T RefB = 10 + i;
check_value(i, RefA, ReferenceA[i], "PtrA");
check_value(i, RefB, ReferenceB[i], "PtrB");
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, RefB, DataB[i], "PtrB"));
}

// Update GraphExecA using whole graph update
@@ -81,10 +79,10 @@
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T RefA = 10 + i + (i * 2) + LocalSize / 2;
T RefB = 10 + i + (i * 2) + LocalSize;
check_value(i, RefA, ReferenceA[i], "PtrA");
check_value(i, RefB, ReferenceB[i], "PtrB");
T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
T RefB = 10 + i + Iterations * ((i * 2) + LocalSize);
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, RefB, DataB[i], "PtrB"));
}

free(PtrA, Queue);
125 changes: 125 additions & 0 deletions sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp
@@ -0,0 +1,125 @@
// Tests whole-graph update of nodes that use two local accessors,
// and submission of the graph.

#include "../graph_common.hpp"

using T = int;

void add_graph_nodes(
exp_ext::command_graph<exp_ext::graph_state::modifiable> &Graph,
queue &Queue, size_t Size, size_t LocalSize, T *Ptr) {
auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
local_accessor<T, 1> LocalMemA(LocalSize, CGH);
local_accessor<T, 1> LocalMemB(LocalSize, CGH);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
auto LocalID = Item.get_local_linear_id();
auto GlobalID = Item.get_global_linear_id();
LocalMemA[LocalID] = GlobalID;
LocalMemB[LocalID] = Item.get_local_range(0);
Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
});
});

// Introduce by-value kernel parameters so that the local-accessor arguments
// are not set at contiguous kernel argument indices
T Constant1 = 2;
T Constant2 = 0;
T Constant3 = 0;
T Constant4 = 0;
auto NodeB = add_node(
Graph, Queue,
[&](handler &CGH) {
local_accessor<T, 1> LocalMemA(LocalSize, CGH);
local_accessor<T, 1> LocalMemB(LocalSize, CGH);

depends_on_helper(CGH, NodeA);

CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
auto LocalID = Item.get_local_linear_id();
auto GlobalID = Item.get_global_linear_id();
LocalMemA[LocalID] = GlobalID + Constant2 + Constant3;
LocalMemB[LocalID] = Constant1 + Constant4;
Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
});
},
NodeA);
}

int main() {
queue Queue{};

const size_t LocalSize = 128;

std::vector<T> DataA(Size), DataB(Size);

std::iota(DataA.begin(), DataA.end(), 10);
std::iota(DataB.begin(), DataB.end(), 10);

exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.wait_and_throw();

size_t GraphALocalSize = LocalSize / 2;
add_graph_nodes(GraphA, Queue, Size, GraphALocalSize, PtrA);

auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{});

// Create second graph for whole graph update with a different local size
exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
add_graph_nodes(GraphB, Queue, Size, LocalSize, PtrB);

// Execute graphs before updating and check outputs
for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
}

Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrB, DataB.data(), Size);
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T Init = 10 + i;
T NodeA = i * GraphALocalSize;
T NodeB = i * 2;
T RefA = Init + Iterations * (NodeA + NodeB);
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, Init, DataB[i], "PtrB"));
}

// Update GraphExecA using whole graph update

GraphExecA.update(GraphB);

// Execute graphs again and check outputs
for (unsigned n = 0; n < Iterations; n++) {
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
}

Queue.wait_and_throw();

Queue.copy(PtrA, DataA.data(), Size);
Queue.copy(PtrB, DataB.data(), Size);
Queue.wait_and_throw();

for (size_t i = 0; i < Size; i++) {
T Init = 10 + i;
T NodeAGraphA = i * GraphALocalSize;
T NodeAGraphB = i * LocalSize;
T NodeB = i * 2;
T RefA = Init + Iterations * (NodeAGraphA + NodeB);
T RefB = Init + Iterations * (NodeAGraphB + NodeB);
assert(check_value(i, RefA, DataA[i], "PtrA"));
assert(check_value(i, RefB, DataB[i], "PtrB"));
}

free(PtrA, Queue);
free(PtrB, Queue);
return 0;
}
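GraphA is built with a work-group size of GraphALocalSize = LocalSize / 2 and GraphExecA is then whole-graph updated from GraphB, which uses LocalSize, so the sizes of the local-memory allocations behind the two local accessors genuinely change during the update, which is the local-memory argument update path named in the commit title. With N = Iterations and L = LocalSize, the values checked after the second submission loop are:

\[ \mathrm{RefA}_i = (10 + i) + N\left(i \cdot \tfrac{L}{2} + 2i\right), \qquad \mathrm{RefB}_i = (10 + i) + N\left(i \cdot L + 2i\right). \]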