intel · sarnex · Dec 3, 2024 · Nov 8, 2024 · Dec 2, 2024 · Dec 3, 2024
@@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT)
       CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
   endfunction()
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
+  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
   include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")

@@ -1,7 +1,7 @@
-# commit 7a38cc3e21d83940622d63b0b902bf68f9539f6f
-# Merge: 2dac0daaf327 252b3822f0e8
-# Author: Martin Grant <martin.morrisongrant@codeplay.com>
-# Date:   Mon Dec 2 15:41:23 2024 +0000
-#     Merge pull request #2395 from pbalcer/fix-event-pooling
-#     fix event caching
-set(UNIFIED_RUNTIME_TAG 7a38cc3e21d83940622d63b0b902bf68f9539f6f)
+# commit 2bea25d7d5404ce82f36caf91c359f65b25187d7
+# Merge: 0b5d8f9e e578228a
+# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
+# Date:   Tue Dec 3 12:41:16 2024 +0000
+#     Merge pull request #2298 from Bensuo/ewan/cuda_update_local_size
+#     Improve CUDA/HIP local argument handling
+set(UNIFIED_RUNTIME_TAG 2bea25d7d5404ce82f36caf91c359f65b25187d7)
@@ -1512,7 +1512,10 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
   PtrDescs.reserve(MaskedArgs.size());
   ValueDescs.reserve(MaskedArgs.size());
 
-  ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc;
+  ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc{};
+  UpdateDesc.stype =
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC;
+  UpdateDesc.pNext = nullptr;
 
   // Collect arg descriptors and fill kernel launch descriptor
   using sycl::detail::kernel_param_kind_t;

@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/local_accessor_multiple_accessors.cpp"
@@ -0,0 +1,10 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/local_accessor_multiple_nodes.cpp"
@@ -0,0 +1,12 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// REQUIRES: ocloc && (opencl || level_zero)
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/opencl_local_acc.cpp"
@@ -10,20 +10,18 @@ int main() {
 
   const size_t LocalSize = 128;
 
-  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+  std::vector<T> HostData(Size);
 
-  std::iota(DataA.begin(), DataA.end(), 10);
-
-  std::vector<T> ReferenceA(DataA);
+  std::iota(HostData.begin(), HostData.end(), 10);
 
   exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
 
   T *PtrA = malloc_device<T>(Size, Queue);
 
-  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(HostData.data(), PtrA, Size);
   Queue.wait_and_throw();
 
-  auto node = add_node(Graph, Queue, [&](handler &CGH) {
+  auto Node = add_node(Graph, Queue, [&](handler &CGH) {
     local_accessor<T, 1> LocalMem(LocalSize, CGH);
 
     CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
@@ -40,14 +38,14 @@ int main() {
 
   Queue.wait_and_throw();
 
-  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrA, HostData.data(), Size);
   Queue.wait_and_throw();
 
   free(PtrA, Queue);
 
   for (size_t i = 0; i < Size; i++) {
-    T Ref = 10 + i + (i * 2);
-    check_value(i, Ref, ReferenceA[i], "PtrA");
+    T Ref = 10 + i + (Iterations * (i * 2));
+    assert(check_value(i, Ref, HostData[i], "PtrA"));
   }
 
   return 0;

@@ -0,0 +1,62 @@
+// Tests adding of nodes with more than one local accessor,
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  using T = int;
+
+  const size_t LocalSize = 128;
+
+  std::vector<T> HostData(Size);
+
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrIn = malloc_device<T>(Size, Queue);
+  T *PtrOut = malloc_device<T>(Size, Queue);
+
+  Queue.memset(PtrOut, 0, Size * sizeof(T));
+  Queue.copy(HostData.data(), PtrIn, Size);
+  Queue.wait_and_throw();
+
+  auto Node = add_node(Graph, Queue, [&](handler &CGH) {
+    local_accessor<T, 1> LocalMemA(LocalSize, CGH);
+    local_accessor<T, 1> LocalMemB(LocalSize, CGH);
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      auto LocalID = Item.get_local_linear_id();
+      auto GlobalID = Item.get_global_linear_id();
+      LocalMemA[LocalID] = GlobalID;
+      LocalMemB[LocalID] = PtrIn[GlobalID];
+      PtrOut[GlobalID] += LocalMemA[LocalID] * LocalMemB[LocalID];
+    });
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrOut, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrIn, Queue);
+  free(PtrOut, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = 0;
+    for (size_t n = 0; n < Iterations; n++) {
+      Ref += (i * (10 + i));
+    }
+    assert(check_value(i, Ref, HostData[i], "PtrOut"));
+  }
+
+  return 0;
+}
@@ -0,0 +1,71 @@
+// Test creating a graph where more than one nodes uses local accessors,
+// and submits of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  using T = int;
+
+  const size_t LocalSize = 128;
+
+  std::vector<T> HostData(Size);
+
+  std::iota(HostData.begin(), HostData.end(), 10);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+
+  Queue.copy(HostData.data(), PtrA, Size);
+  Queue.wait_and_throw();
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    local_accessor<T, 1> LocalMem(LocalSize, CGH);
+
+    CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+      LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2;
+      PtrA[Item.get_global_linear_id()] += LocalMem[Item.get_local_linear_id()];
+    });
+  });
+
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        local_accessor<T, 1> LocalMem(LocalSize, CGH);
+        depends_on_helper(CGH, NodeA);
+
+        CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) {
+          LocalMem[Item.get_local_linear_id()] = 3;
+          PtrA[Item.get_global_linear_id()] *=
+              LocalMem[Item.get_local_linear_id()];
+        });
+      },
+      NodeA);
+
+  auto GraphExec = Graph.finalize();
+
+  for (unsigned n = 0; n < Iterations; n++) {
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, HostData.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    T Ref = 10 + i;
+
+    for (size_t n = 0; n < Iterations; n++) {
+      Ref += i * 2;
+      Ref *= 3;
+    }
+    assert(check_value(i, Ref, HostData[i], "PtrA"));
+  }
+
+  return 0;
+}
@@ -0,0 +1,78 @@
+// Tests using an OpenCL-C defined kernel with multiple local accessors
+
+#include "../graph_common.hpp"
+
+using source_kb = sycl::kernel_bundle<sycl::bundle_state::ext_oneapi_source>;
+using exe_kb = sycl::kernel_bundle<sycl::bundle_state::executable>;
+
+auto constexpr LocalAccCLSource = R"===(
+    kernel void test_la(__global int *out, __local int* local_ptr1,
+                        __local int2* local_ptr2, int n) {
+        __local int4 local_data[1];
+
+        size_t gid = get_global_id(0);
+        size_t lid = get_local_id(0);
+        size_t wg_size = get_num_groups(0);
+
+        local_ptr1[lid] = lid;
+        local_ptr2[lid].x = n;
+        local_ptr2[lid].y = wg_size;
+
+        if (lid == 0) {
+          local_data[lid] = (int4)(0xA, 0xB, 0xC, 0xD);
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int acc = local_data[0].x + local_data[0].y + local_data[0].z +
+                  local_data[0].w;
+        out[gid] = (local_ptr1[lid] * local_ptr2[lid].x) +
+                   (local_ptr2[lid].y * acc);
+    }
+)===";
+
+int main() {
+  queue Queue;
+
+  source_kb kbSrc = exp_ext::create_kernel_bundle_from_source(
+      Queue.get_context(), exp_ext::source_language::opencl, LocalAccCLSource);
+  exe_kb kbExe1 = exp_ext::build(kbSrc);
+  sycl::kernel test_kernel = kbExe1.ext_oneapi_get_kernel("test_la");
+
+  exp_ext::command_graph Graph{Queue};
+
+  int32_t *Ptr = malloc_device<int32_t>(Size, Queue);
+
+  int32_t N = 42;
+  constexpr size_t LocalSize = 256;
+  auto Node = add_node(Graph, Queue, [&](handler &cgh) {
+    local_accessor<int32_t, 1> acc_local1(LocalSize, cgh);
+    local_accessor<sycl::int2, 1> acc_local2(LocalSize, cgh);
+
+    cgh.set_arg(0, Ptr);
+    cgh.set_arg(1, acc_local1);
+    cgh.set_arg(2, acc_local2);
+    cgh.set_arg(3, N);
+
+    cgh.parallel_for(nd_range<1>(Size, LocalSize), test_kernel);
+  });
+
+  auto ExecGraph = Graph.finalize();
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  std::vector<int32_t> HostData(Size);
+  Queue.copy(Ptr, HostData.data(), Size).wait();
+
+  constexpr int32_t Acc = 0xA + 0xB + 0xC + 0xD;
+  constexpr int32_t WorkGroups = Size / LocalSize;
+  constexpr int32_t Tmp = Acc * WorkGroups;
+  for (size_t i = 0; i < Size; i++) {
+    int32_t local_id = i % LocalSize;
+    int32_t Ref = (local_id * N) + Tmp;
+    assert(HostData[i] == Ref);
+  }
+
+  sycl::free(Ptr, Queue);
+
+  return 0;
+}
@@ -28,8 +28,6 @@ int main() {
   std::iota(DataA.begin(), DataA.end(), 10);
   std::iota(DataB.begin(), DataB.end(), 10);
 
-  std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
-
   exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
 
   T *PtrA = malloc_device<T>(Size, Queue);
@@ -59,10 +57,10 @@ int main() {
   Queue.wait_and_throw();
 
   for (size_t i = 0; i < Size; i++) {
-    T RefA = 10 + i + (i * 2) + LocalSize / 2;
+    T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
     T RefB = 10 + i;
-    check_value(i, RefA, ReferenceA[i], "PtrA");
-    check_value(i, RefB, ReferenceB[i], "PtrB");
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, RefB, DataB[i], "PtrB"));
   }
 
   // Update GraphExecA using whole graph update
@@ -81,10 +79,10 @@ int main() {
   Queue.wait_and_throw();
 
   for (size_t i = 0; i < Size; i++) {
-    T RefA = 10 + i + (i * 2) + LocalSize / 2;
-    T RefB = 10 + i + (i * 2) + LocalSize;
-    check_value(i, RefA, ReferenceA[i], "PtrA");
-    check_value(i, RefB, ReferenceB[i], "PtrB");
+    T RefA = 10 + i + Iterations * ((i * 2) + (LocalSize / 2));
+    T RefB = 10 + i + Iterations * ((i * 2) + LocalSize);
+    assert(check_value(i, RefA, DataA[i], "PtrA"));
+    assert(check_value(i, RefB, DataB[i], "PtrB"));
   }
 
   free(PtrA, Queue);