diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 1d3935f9e5e2a..e73e7a3d52331 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1 @@ -# commit 3a5b23c8b475712f9107c1d5ab41f27a1465578e -# Merge: f9f71f17 1696524d -# Author: Piotr Balcer -# Date: Thu Nov 14 14:38:05 2024 +0100 -# Merge pull request #2253 from pbalcer/low-power-events -# add low-power events experimental extension spec -set(UNIFIED_RUNTIME_TAG 3a5b23c8b475712f9107c1d5ab41f27a1465578e) +set(UNIFIED_RUNTIME_TAG "ewan/cuda_update_local_size") diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 861ec2a883601..d723d8d83511f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1510,7 +1510,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr Node) { PtrDescs.reserve(MaskedArgs.size()); ValueDescs.reserve(MaskedArgs.size()); - ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{}; + UpdateDesc.stype = + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC; + UpdateDesc.pNext = nullptr; // Collect arg descriptors and fill kernel launch descriptor using sycl::detail::kernel_param_kind_t; diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..c4d704216a082 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor_multiple_accessors.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..83eff54307ea3 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/local_accessor_multiple_nodes.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp index b3ac9fde67b6e..b6f6a7dd34348 100644 --- a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp +++ b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp @@ -10,20 +10,18 @@ int main() { const size_t LocalSize = 128; - std::vector DataA(Size), DataB(Size), DataC(Size); + std::vector HostData(Size); - std::iota(DataA.begin(), DataA.end(), 10); - - std::vector ReferenceA(DataA); + std::iota(HostData.begin(), HostData.end(), 10); exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; T *PtrA = malloc_device(Size, Queue); - Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(HostData.data(), PtrA, Size); Queue.wait_and_throw(); - auto node = add_node(Graph, Queue, [&](handler &CGH) { + auto Node = add_node(Graph, Queue, [&](handler &CGH) { local_accessor LocalMem(LocalSize, CGH); CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { @@ -40,14 +38,14 @@ int main() { Queue.wait_and_throw(); - Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrA, HostData.data(), Size); Queue.wait_and_throw(); free(PtrA, Queue); for (size_t i = 0; i < Size; i++) { - T Ref = 10 + i + (i * 2); - check_value(i, Ref, ReferenceA[i], "PtrA"); + T Ref = 10 + i + (Iterations * (i * 2)); + assert(check_value(i, Ref, HostData[i], "PtrA")); } return 0; diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..0ae67a4469a8c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp @@ -0,0 +1,62 @@ +// Tests adding of nodes with more than one local accessor, +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector HostData(Size); + + std::iota(HostData.begin(), HostData.end(), 10); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrIn = malloc_device(Size, Queue); + T *PtrOut = malloc_device(Size, Queue); + + Queue.memset(PtrOut, 0, Size * sizeof(T)); + Queue.copy(HostData.data(), PtrIn, Size); + Queue.wait_and_throw(); + + auto Node = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID] = PtrIn[GlobalID]; + PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID]; + }); + }); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrOut, HostData.data(), Size); + Queue.wait_and_throw(); + + free(PtrIn, Queue); + free(PtrOut, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 0; + for (size_t n = 0; n < Iterations; n++) { + Ref += (i * (10 + i)); + } + assert(check_value(i, Ref, HostData[i], "PtrOut")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..8d38824c680ae --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp @@ -0,0 +1,71 @@ +// Test creating a graph where more than one nodes uses local accessors, +// and submits of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue{}; + + using T = int; + + const size_t LocalSize = 128; + + std::vector HostData(Size); + + std::iota(HostData.begin(), HostData.end(), 10); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(HostData.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()]; + }); + }); + + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + local_accessor LocalMem(LocalSize, CGH); + depends_on_helper(CGH, NodeA); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + LocalMem[Item.get_local_linear_id()] = 3; + PtrA[Item.get_global_linear_id()] *= + LocalMem[Item.get_local_linear_id()]; + }); + }, + NodeA); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostData.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i; + + for (size_t n = 0; n < Iterations; n++) { + Ref += i * 2; + Ref *= 3; + } + assert(check_value(i, Ref, HostData[i], "PtrA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp index 100792a2e4762..6feb497115b0e 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp @@ -28,8 +28,6 @@ int main() { std::iota(DataA.begin(), DataA.end(), 10); std::iota(DataB.begin(), DataB.end(), 10); - std::vector ReferenceA(DataA), ReferenceB(DataB); - exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; T *PtrA = malloc_device(Size, Queue); @@ -59,10 +57,10 @@ int main() { Queue.wait_and_throw(); for (size_t i = 0; i < Size; i++) { - T RefA = 10 + i + (i * 2) + LocalSize / 2; + T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2)); T RefB = 10 + i; - check_value(i, RefA, ReferenceA[i], "PtrA"); - check_value(i, RefB, ReferenceB[i], "PtrB"); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); } // Update GraphExecA using whole graph update @@ -81,10 +79,10 @@ int main() { Queue.wait_and_throw(); for (size_t i = 0; i < Size; i++) { - T RefA = 10 + i + (i * 2) + LocalSize / 2; - T RefB = 10 + i + (i * 2) + LocalSize; - check_value(i, RefA, ReferenceA[i], "PtrA"); - check_value(i, RefB, ReferenceB[i], "PtrB"); + T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2)); + T RefB = 10 + i + Iterations * ((i * 2) + LocalSize); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); } free(PtrA, Queue); diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..95f55736702a5 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp @@ -0,0 +1,122 @@ +// Tests whole graph update of nodes with 2 local accessors, +// and submission of the graph. + +#include "../graph_common.hpp" + +using T = int; + +void add_graph_nodes( + exp_ext::command_graph &Graph, + queue &Queue, size_t Size, size_t LocalSize, T *Ptr) { + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize, CGH); + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID] = Item.get_local_range(0); + Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID]; + }); + }); + + // Introduce value params so that local arguments are not contiguous indices + // when set as kernel arguments + T Constant = 2; + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + local_accessor LocalMemA(LocalSize, CGH); + local_accessor LocalMemB(LocalSize * 2, CGH); + + depends_on_helper(CGH, NodeA); + + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + auto LocalID = Item.get_local_linear_id(); + auto GlobalID = Item.get_global_linear_id(); + LocalMemA[LocalID] = GlobalID; + LocalMemB[LocalID * 2] = Constant; + LocalMemB[(LocalID * 2) + 1] = Constant; + Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID * 2] * + LocalMemB[(LocalID * 2) + 1]; + }); + }, + NodeA); +} +int main() { + queue Queue{}; + + const size_t LocalSize = 128; + + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 10); + std::iota(DataB.begin(), DataB.end(), 10); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.wait_and_throw(); + + size_t GraphALocalSize = LocalSize / 2; + add_graph_nodes(GraphA, Queue, Size, GraphALocalSize, PtrA); + + auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{}); + + // Create second graph for whole graph update with a different local size + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + add_graph_nodes(GraphB, Queue, Size, LocalSize, PtrB); + + // Execute graphs before updating and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T Init = 10 + i; + T NodeA = i * GraphALocalSize; + T NodeB = i * 4; + T RefA = Init + Iterations * (NodeA + NodeB); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, Init, DataB[i], "PtrB")); + } + + // Update GraphExecA using whole graph update + GraphExecA.update(GraphB); + + // Execute graphs again and check outputs + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.wait_and_throw(); + + for (size_t i = 0; i < Size; i++) { + T Init = 10 + i; + T NodeAGraphA = i * GraphALocalSize; + T NodeAGraphB = i * LocalSize; + T NodeB = i * 4; + T RefA = Init + Iterations * (NodeAGraphA + NodeB); + T RefB = Init + Iterations * (NodeAGraphB + NodeB); + assert(check_value(i, RefA, DataA[i], "PtrA")); + assert(check_value(i, RefB, DataB[i], "PtrB")); + } + + free(PtrA, Queue); + free(PtrB, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp new file mode 100644 index 0000000000000..aa044c80e19ec --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_accessors.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor_multiple_accessors.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp new file mode 100644 index 0000000000000..5830c21a57431 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/local_accessor_multiple_nodes.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/local_accessor_multiple_nodes.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..e1a9ccf9a941d --- /dev/null +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../../Inputs/whole_update_local_acc_multi.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp new file mode 100644 index 0000000000000..f953915379641 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../../Inputs/whole_update_local_acc_multi.cpp"