[SYCL][Graph] Fix CUDA local mem argument update bug

intel · Nov 11, 2024 · 731cdc0 · 731cdc0
1 parent 6d1f4fc
commit 731cdc0
Show file tree

Hide file tree

Showing 14 changed files with 338 additions and 21 deletions.
diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
       CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
   endfunction()
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git")
   include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")

diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake
@@ -4,4 +4,4 @@
 # Date:   Mon Nov 11 13:35:24 2024 +0000
 #     Merge pull request #2112 from martygrant/martin/context-cts-spec-gap
 #     Improvements to align CTS and Spec for Context
-set(UNIFIED_RUNTIME_TAG 2eae687a4cf24ba02ee8e9ebb9552c1d392972ee)
+set(UNIFIED_RUNTIME_TAG "ewan/cuda_update_local_size")
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
@@ -1510,7 +1510,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
   PtrDescs.reserve(MaskedArgs.size());
   ValueDescs.reserve(MaskedArgs.size());
 
-  ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc;
+  ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{};
+  UpdateDesc.stype =
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
+  UpdateDesc.pNext = nullptr;
 
   // Collect arg descriptors and fill kernel launch descriptor
   using sycl::detail::kernel_param_kind_t;

diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_accessors.cpp
@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/local_accessor_multiple_accessors.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Explicit/local_accessor_multiple_nodes.cpp
@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/local_accessor_multiple_nodes.cpp"
diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor.cpp
@@ -10,20 +10,18 @@ int main() {
 
   const size_t LocalSize = 128;
 
-  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+  std::vector<T> HostData(Size);
 
-  std::iota(DataA.begin(), DataA.end(), 10);
-
-  std::vector<T> ReferenceA(DataA);
+  std::iota(HostData.begin(), HostData.end(), 10);
 
   exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
 
   T *PtrA = malloc_device<T>(Size, Queue);
 
-  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(HostData.data(), PtrA, Size);
   Queue.wait_and_throw();
 
-  auto node = add_node(Graph, Queue, [&](handler &CGH) {
+  auto Node = add_node(Graph, Queue, [&](handler &CGH) {
     local_accessor<T, 1> LocalMem(LocalSize, CGH);
 
     CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
@@ -40,14 +38,14 @@ int main() {
 
   Queue.wait_and_throw();
 
-  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrA, HostData.data(), Size);
   Queue.wait_and_throw();
 
   free(PtrA, Queue);
 
   for (size_t i = 0; i < Size; i++) {
-    T Ref = 10 + i + (i * 2);
-    check_value(i, Ref, ReferenceA[i], "PtrA");
+    T Ref = 10 + i + (Iterations * (i * 2));
+    assert(check_value(i, Ref, HostData[i], "PtrA"));
   }
 
   return 0;

diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_accessors.cpp
@@ -0,0 +1,62 @@
+// Tests adding of nodes with more than one local accessor,
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  using T = int;
+
+  const size_t LocalSize = 128;
+
+  std::vector<T> HostData(Size);
+
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrIn = malloc_device<T>(Size, Queue);
+  T *PtrOut = malloc_device<T>(Size, Queue);
+
+  Queue.memset(PtrOut, 0, Size * sizeof(T));
+  Queue.copy(HostData.data(), PtrIn, Size);
+  Queue.wait_and_throw();
+
+  auto Node = add_node(Graph, Queue, [&](handler &CGH) {
+    local_accessor<T, 1> LocalMemA(LocalSize, CGH);
+    local_accessor<T, 1> LocalMemB(LocalSize, CGH);
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      auto LocalID = Item.get_local_linear_id();
+      auto GlobalID = Item.get_global_linear_id();
+      LocalMemA[LocalID] = GlobalID;
+      LocalMemB[LocalID] = PtrIn[GlobalID];
+      PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
+    });
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrOut, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrIn, Queue);
+  free(PtrOut, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = 0;
+    for (size_t n = 0; n < Iterations; n++) {
+      Ref += (i * (10 + i));
+    }
+    assert(check_value(i, Ref, HostData[i], "PtrOut"));
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp b/sycl/test-e2e/Graph/Inputs/local_accessor_multiple_nodes.cpp
@@ -0,0 +1,71 @@
+// Test creating a graph where more than one nodes uses local accessors,
+// and submits of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  using T = int;
+
+  const size_t LocalSize = 128;
+
+  std::vector<T> HostData(Size);
+
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+
+  Queue.copy(HostData.data(), PtrA, Size);
+  Queue.wait_and_throw();
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    local_accessor<T, 1> LocalMem(LocalSize, CGH);
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
+      PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
+    });
+  });
+
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        local_accessor<T, 1> LocalMem(LocalSize, CGH);
+        depends_on_helper(CGH, NodeA);
+
+        CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+          LocalMem[Item.get_local_linear_id()] = 3;
+          PtrA[Item.get_global_linear_id()] *=
+              LocalMem[Item.get_local_linear_id()];
+        });
+      },
+      NodeA);
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = 10 + i;
+
+    for (size_t n = 0; n < Iterations; n++) {
+      Ref += i * 2;
+      Ref *= 3;
+    }
+    assert(check_value(i, Ref, HostData[i], "PtrA"));
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc.cpp
@@ -28,8 +28,6 @@ int main() {
   std::iota(DataA.begin(), DataA.end(), 10);
   std::iota(DataB.begin(), DataB.end(), 10);
 
-  std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
-
   exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
 
   T *PtrA = malloc_device<T>(Size, Queue);
@@ -59,10 +57,10 @@ int main() {
   Queue.wait_and_throw();
 
   for (size_t i = 0; i < Size; i++) {
-    T RefA = 10 + i + (i * 2) + LocalSize / 2;
+    T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
     T RefB = 10 + i;
-    check_value(i, RefA, ReferenceA[i], "PtrA");
-    check_value(i, RefB, ReferenceB[i], "PtrB");
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, RefB, DataB[i], "PtrB"));
   }
 
   // Update GraphExecA using whole graph update
@@ -81,10 +79,10 @@ int main() {
   Queue.wait_and_throw();
 
   for (size_t i = 0; i < Size; i++) {
-    T RefA = 10 + i + (i * 2) + LocalSize / 2;
-    T RefB = 10 + i + (i * 2) + LocalSize;
-    check_value(i, RefA, ReferenceA[i], "PtrA");
-    check_value(i, RefB, ReferenceB[i], "PtrB");
+    T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
+    T RefB = 10 + i + Iterations * ((i * 2) + LocalSize);
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, RefB, DataB[i], "PtrB"));
   }
 
   free(PtrA, Queue);

diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_local_acc_multi.cpp
@@ -0,0 +1,125 @@
+// Tests whole graph update of nodes with 2 local accessors,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+using T = int;
+
+void add_graph_nodes(
+    exp_ext::command_graph<exp_ext::graph_state::modifiable> &Graph,
+    queue &Queue, size_t Size, size_t LocalSize, T *Ptr) {
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    local_accessor<T, 1> LocalMemA(LocalSize, CGH);
+    local_accessor<T, 1> LocalMemB(LocalSize, CGH);
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      auto LocalID = Item.get_local_linear_id();
+      auto GlobalID = Item.get_global_linear_id();
+      LocalMemA[LocalID] = GlobalID;
+      LocalMemB[LocalID] = Item.get_local_range(0);
+      Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
+    });
+  });
+
+  // Introduce value params so that local arguments are not contiguous indices
+  // when set as kernel arguments
+  T Constant1 = 2;
+  T Constant2 = 0;
+  T Constant3 = 0;
+  T Constant4 = 0;
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        local_accessor<T, 1> LocalMemA(LocalSize, CGH);
+        local_accessor<T, 1> LocalMemB(LocalSize, CGH);
+
+        depends_on_helper(CGH, NodeA);
+
+        CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+          auto LocalID = Item.get_local_linear_id();
+          auto GlobalID = Item.get_global_linear_id();
+          LocalMemA[LocalID] = GlobalID + Constant2 + Constant3;
+          LocalMemB[LocalID] = Constant1 + Constant4;
+          Ptr[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
+        });
+      },
+      NodeA);
+}
+int main() {
+  queue Queue{};
+
+  const size_t LocalSize = 128;
+
+  std::vector<T> DataA(Size), DataB(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 10);
+  std::iota(DataB.begin(), DataB.end(), 10);
+
+  exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.wait_and_throw();
+
+  size_t GraphALocalSize = LocalSize / 2;
+  add_graph_nodes(GraphA, Queue, Size, GraphALocalSize, PtrA);
+
+  auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{});
+
+  // Create second graph for whole graph update with a different local size
+  exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
+  add_graph_nodes(GraphB, Queue, Size, LocalSize, PtrB);
+
+  // Execute graphs before updating and check outputs
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    T Init = 10 + i;
+    T NodeA = i * GraphALocalSize;
+    T NodeB = i * 2;
+    T RefA = Init + Iterations * (NodeA + NodeB);
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, Init, DataB[i], "PtrB"));
+  }
+
+  // Update GraphExecA using whole graph update
+
+  GraphExecA.update(GraphB);
+
+  // Execute graphs again and check outputs
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExecA); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.wait_and_throw();
+
+  for (size_t i = 0; i < Size; i++) {
+    T Init = 10 + i;
+    T NodeAGraphA = i * GraphALocalSize;
+    T NodeAGraphB = i * LocalSize;
+    T NodeB = i * 2;
+    T RefA = Init + Iterations * (NodeAGraphA + NodeB);
+    T RefB = Init + Iterations * (NodeAGraphB + NodeB);
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, RefB, DataB[i], "PtrB"));
+  }
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  return 0;
+}