From 5b75d97f31ea2c66e4411377c0f2cc4210f8b3d2 Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Mon, 22 Sep 2025 09:35:51 -0700
Subject: [PATCH 1/6] add test for partitioned wait

---
 .../Graph/RecordReplay/partitioned_wait.cpp   | 144 ++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
diff --git a/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
new file mode 100644
index 0000000000000..30db4c0822ebd
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
@@ -0,0 +1,144 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+
+// Tests partitioned wait feature in SYCL Graph.
+// This test demonstrates how queue.wait() calls during recording create 
+// dummy nodes that partition the graph into "before", barrier, and "after" subgraphs.
+
+#include "../graph_common.hpp"
+
+#include <sycl/properties/all_properties.hpp>
+
+int main() {
+  property_list Properties{property::queue::in_order{}};
+  queue Queue{Properties};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 100;
+  int *A = malloc_device<int>(N, Queue);
+  int *B = malloc_device<int>(N, Queue);
+  int *C = malloc_device<int>(N, Queue);
+  int *D = malloc_device<int>(N, Queue);
+
+  // Initialize data outside recording
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      A[it] = static_cast<int>(it);
+      B[it] = 0;
+      C[it] = 0;
+      D[it] = 0;
+    });
+  }).wait();
+
+  // Begin recording the graph
+  Graph.begin_recording(Queue);
+
+  // Part 1: "Before" subgraph operations
+  auto Event1 = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      B[it] = A[it] * 2; // B = A * 2
+    });
+  });
+
+  auto Event2 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(Event1);
+    CGH.parallel_for(N, [=](id<1> it) {
+      C[it] = B[it] + 1; // C = B + 1 = A * 2 + 1
+    });
+  });
+
+  // This queue.wait() should create a dummy barrier node in the graph
+  // instead of asserting, partitioning the graph into before/after sections
+  Queue.wait();
+
+  // Part 2: "After" subgraph operations 
+  // These operations should be in the "after" partition
+  auto Event3 = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      D[it] = C[it] * 3; // D = C * 3 = (A * 2 + 1) * 3
+    });
+  });
+
+  // Another wait should create another barrier
+  Queue.wait();
+
+  // More operations after second barrier
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      D[it] = D[it] + A[it]; // D = D + A = (A * 2 + 1) * 3 + A = 6*A + 3 + A = 7*A + 3
+    });
+  });
+
+  Graph.end_recording();
+
+  // Finalize the graph - this should trigger partitioning
+  auto ExecGraph = Graph.finalize();
+
+  // Execute the partitioned graph
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+  Queue.wait_and_throw();
+
+  // Verify results
+  std::vector<int> OutputA(N), OutputB(N), OutputC(N), OutputD(N);
+  Queue.memcpy(OutputA.data(), A, N * sizeof(int)).wait();
+  Queue.memcpy(OutputB.data(), B, N * sizeof(int)).wait();
+  Queue.memcpy(OutputC.data(), C, N * sizeof(int)).wait();
+  Queue.memcpy(OutputD.data(), D, N * sizeof(int)).wait();
+
+  // Check results
+  for (size_t i = 0; i < N; i++) {
+    int expected_a = static_cast<int>(i);
+    int expected_b = expected_a * 2;
+    int expected_c = expected_b + 1;
+    int expected_d = 7 * expected_a + 3;
+
+    assert(check_value(i, expected_a, OutputA[i], "A"));
+    assert(check_value(i, expected_b, OutputB[i], "B"));
+    assert(check_value(i, expected_c, OutputC[i], "C"));
+    assert(check_value(i, expected_d, OutputD[i], "D"));
+  }
+
+  // Test multiple execution of the partitioned graph
+  // Reset data
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      A[it] = static_cast<int>(i) + 10; // Different input
+      B[it] = 0;
+      C[it] = 0;
+      D[it] = 0;
+    });
+  }).wait();
+
+  // Execute again
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+  Queue.wait_and_throw();
+
+  // Verify second execution
+  Queue.memcpy(OutputA.data(), A, N * sizeof(int)).wait();
+  Queue.memcpy(OutputB.data(), B, N * sizeof(int)).wait();
+  Queue.memcpy(OutputC.data(), C, N * sizeof(int)).wait();
+  Queue.memcpy(OutputD.data(), D, N * sizeof(int)).wait();
+
+  for (size_t i = 0; i < N; i++) {
+    int expected_a = static_cast<int>(i) + 10;
+    int expected_b = expected_a * 2;
+    int expected_c = expected_b + 1;
+    int expected_d = 7 * expected_a + 3;
+
+    assert(check_value(i, expected_a, OutputA[i], "A (second execution)"));
+    assert(check_value(i, expected_b, OutputB[i], "B (second execution)"));
+    assert(check_value(i, expected_c, OutputC[i], "C (second execution)"));
+    assert(check_value(i, expected_d, OutputD[i], "D (second execution)"));
+  }
+
+  // Clean up
+  sycl::free(A, Queue);
+  sycl::free(B, Queue);
+  sycl::free(C, Queue);
+  sycl::free(D, Queue);
+
+  return 0;
+}

From 77145f2587e7134f29209a8129d3fdf318b07801 Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Mon, 22 Sep 2025 11:52:20 -0700
Subject: [PATCH 2/6] fix typo

---
 .../Graph/RecordReplay/partitioned_wait.cpp   | 25 +++++--------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
index 30db4c0822ebd..e4ef0ff783862 100644
--- a/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
@@ -4,8 +4,6 @@
 // RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 
 // Tests partitioned wait feature in SYCL Graph.
-// This test demonstrates how queue.wait() calls during recording create 
-// dummy nodes that partition the graph into "before", barrier, and "after" subgraphs.
 
 #include "../graph_common.hpp"
 
@@ -23,7 +21,6 @@ int main() {
   int *C = malloc_device<int>(N, Queue);
   int *D = malloc_device<int>(N, Queue);
 
-  // Initialize data outside recording
   Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
       A[it] = static_cast<int>(it);
@@ -39,45 +36,38 @@ int main() {
   // Part 1: "Before" subgraph operations
   auto Event1 = Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
-      B[it] = A[it] * 2; // B = A * 2
+      B[it] = A[it] * 2;
     });
   });
 
   auto Event2 = Queue.submit([&](handler &CGH) {
     CGH.depends_on(Event1);
     CGH.parallel_for(N, [=](id<1> it) {
-      C[it] = B[it] + 1; // C = B + 1 = A * 2 + 1
+      C[it] = B[it] + 1;
     });
   });
 
-  // This queue.wait() should create a dummy barrier node in the graph
-  // instead of asserting, partitioning the graph into before/after sections
+  // should create a dummy barrier node in the graph
   Queue.wait();
 
   // Part 2: "After" subgraph operations 
-  // These operations should be in the "after" partition
   auto Event3 = Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
-      D[it] = C[it] * 3; // D = C * 3 = (A * 2 + 1) * 3
+      D[it] = C[it] * 3;
     });
   });
 
-  // Another wait should create another barrier
   Queue.wait();
 
-  // More operations after second barrier
   Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
-      D[it] = D[it] + A[it]; // D = D + A = (A * 2 + 1) * 3 + A = 6*A + 3 + A = 7*A + 3
+      D[it] = D[it] + A[it]; 
     });
   });
 
   Graph.end_recording();
 
-  // Finalize the graph - this should trigger partitioning
   auto ExecGraph = Graph.finalize();
-
-  // Execute the partitioned graph
   Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
   Queue.wait_and_throw();
 
@@ -88,7 +78,6 @@ int main() {
   Queue.memcpy(OutputC.data(), C, N * sizeof(int)).wait();
   Queue.memcpy(OutputD.data(), D, N * sizeof(int)).wait();
 
-  // Check results
   for (size_t i = 0; i < N; i++) {
     int expected_a = static_cast<int>(i);
     int expected_b = expected_a * 2;
@@ -101,18 +90,16 @@ int main() {
     assert(check_value(i, expected_d, OutputD[i], "D"));
   }
 
-  // Test multiple execution of the partitioned graph
   // Reset data
   Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
-      A[it] = static_cast<int>(i) + 10; // Different input
+      A[it] = static_cast<int>(it) + 10; // Different input
       B[it] = 0;
       C[it] = 0;
       D[it] = 0;
     });
   }).wait();
 
-  // Execute again
   Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
   Queue.wait_and_throw();
 

From 1a85b9c95397a2a881698fce81783d56e6b5498d Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Mon, 22 Sep 2025 12:49:04 -0700
Subject: [PATCH 3/6] add initial implementation with barriers

---
 sycl/source/detail/queue_impl.cpp             | 30 +++++++++++++++++--
 .../Graph/RecordReplay/partitioned_wait.cpp   |  8 ++---
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 52ee77d251eb2..93c49071a98c1 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -9,6 +9,9 @@
 #include <detail/event_impl.hpp>
 #include <detail/memory_manager.hpp>
 #include <detail/queue_impl.hpp>
+#include <detail/cg.hpp>
+#include <detail/graph/graph_impl.hpp>
+#include <detail/graph/node_impl.hpp>
 #include <sycl/context.hpp>
 #include <sycl/detail/common.hpp>
 #include <sycl/detail/ur.hpp>
@@ -588,9 +591,30 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
 #endif
 
   if (!MGraph.expired()) {
-    throw sycl::exception(make_error_code(errc::invalid),
-                          "wait cannot be called for a queue which is "
-                          "recording to a command graph.");
+    auto GraphImpl = MGraph.lock();
+
+    // Add a barrier node to the graph to create a partition point.
+    // TODO: test if partitioned wait bits are set
+    if (GraphImpl) {
+      std::vector<detail::EventImplPtr> EmptyWaitList;
+      auto BarrierCG = std::make_shared<detail::CGBarrier>(
+          std::move(EmptyWaitList),
+          ext::oneapi::experimental::event_mode_enum::none,
+          detail::CG::StorageInitHelper{},
+          detail::CGType::BarrierWaitlist,
+          CodeLoc
+      );
+
+      std::vector<ext::oneapi::experimental::detail::node_impl *> EmptyDeps;
+      ext::oneapi::experimental::detail::node_impl &BarrierNode = GraphImpl->add(
+          ext::oneapi::experimental::node_type::ext_oneapi_barrier,
+          std::static_pointer_cast<detail::CG>(BarrierCG),
+          EmptyDeps
+      );
+
+      GraphImpl->setBarrierDep(shared_from_this(), BarrierNode);
+    }
+    return;
   }
 
   // If there is an external event set, we know we are using an in-order queue
diff --git a/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
index e4ef0ff783862..695e7a3fd8ce0 100644
--- a/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/partitioned_wait.cpp
@@ -82,7 +82,7 @@ int main() {
     int expected_a = static_cast<int>(i);
     int expected_b = expected_a * 2;
     int expected_c = expected_b + 1;
-    int expected_d = 7 * expected_a + 3;
+    int expected_d = expected_c * 3 + expected_a;
 
     assert(check_value(i, expected_a, OutputA[i], "A"));
     assert(check_value(i, expected_b, OutputB[i], "B"));
@@ -90,7 +90,7 @@ int main() {
     assert(check_value(i, expected_d, OutputD[i], "D"));
   }
 
-  // Reset data
+  // Reset data and verify with new input
   Queue.submit([&](handler &CGH) {
     CGH.parallel_for(N, [=](id<1> it) {
       A[it] = static_cast<int>(it) + 10; // Different input
@@ -103,7 +103,6 @@ int main() {
   Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
   Queue.wait_and_throw();
 
-  // Verify second execution
   Queue.memcpy(OutputA.data(), A, N * sizeof(int)).wait();
   Queue.memcpy(OutputB.data(), B, N * sizeof(int)).wait();
   Queue.memcpy(OutputC.data(), C, N * sizeof(int)).wait();
@@ -113,7 +112,7 @@ int main() {
     int expected_a = static_cast<int>(i) + 10;
     int expected_b = expected_a * 2;
     int expected_c = expected_b + 1;
-    int expected_d = 7 * expected_a + 3;
+    int expected_d = expected_c * 3 + expected_a;
 
     assert(check_value(i, expected_a, OutputA[i], "A (second execution)"));
     assert(check_value(i, expected_b, OutputB[i], "B (second execution)"));
@@ -121,7 +120,6 @@ int main() {
     assert(check_value(i, expected_d, OutputD[i], "D (second execution)"));
   }
 
-  // Clean up
   sycl::free(A, Queue);
   sycl::free(B, Queue);
   sycl::free(C, Queue);

From 9a52a846dfaeab3659c8fc864e3fe1a13513fab0 Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Mon, 22 Sep 2025 14:12:15 -0700
Subject: [PATCH 4/6] add alternative non-barrier implementation

---
 sycl/source/detail/queue_impl.cpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 93c49071a98c1..8197ef477c744 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -596,6 +596,9 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
     // Add a barrier node to the graph to create a partition point.
     // TODO: test if partitioned wait bits are set
     if (GraphImpl) {
+//#define PARTITIONED_WAIT_IMPL_BARRIERS 1
+#define PARTITIONED_WAIT_IMPL_EMPTY_NODE 1
+#if PARTITIONED_WAIT_IMPL_BARRIERS
       std::vector<detail::EventImplPtr> EmptyWaitList;
       auto BarrierCG = std::make_shared<detail::CGBarrier>(
           std::move(EmptyWaitList),
@@ -613,6 +616,20 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
       );
 
       GraphImpl->setBarrierDep(shared_from_this(), BarrierNode);
+#elif PARTITIONED_WAIT_IMPL_EMPTY_NODE
+      auto EmptyCG = std::make_shared<detail::CG>(
+          detail::CGType::None,
+          detail::CG::StorageInitHelper{},
+          CodeLoc
+      );
+
+      std::vector<ext::oneapi::experimental::detail::node_impl *> EmptyDeps;
+      ext::oneapi::experimental::detail::node_impl &EmptyNode = GraphImpl->add(
+          ext::oneapi::experimental::node_type::empty,
+          EmptyCG,
+          EmptyDeps
+      );
+#endif
     }
     return;
   }

From 4e76475585d618585dab6eddb8bbad7d3fec16ff Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Wed, 1 Oct 2025 13:02:42 -0700
Subject: [PATCH 5/6] introduce new host sync node

---
 .../ext/oneapi/experimental/graph/node.hpp    |   3 +-
 sycl/source/detail/event_impl.cpp             |  26 +++-
 sycl/source/detail/graph/graph_impl.cpp       | 138 +++++++++++++++---
 sycl/source/detail/queue_impl.cpp             |  30 +---
 4 files changed, 145 insertions(+), 52 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp
index 91c756e32049f..fc9312569b436 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph/node.hpp
@@ -42,7 +42,8 @@ enum class node_type {
   host_task = 9,
   native_command = 10,
   async_malloc = 11,
-  async_free = 12
+  async_free = 12,
+  host_sync = 13
 };
 
 /// Class representing a node in the graph, returned by command_graph::add().
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index b0c838cdd890c..f9124fd280c1f 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -278,10 +278,30 @@ void event_impl::wait(bool *Success) {
     throw sycl::exception(make_error_code(errc::invalid),
                           "wait method cannot be used for a discarded event.");
 
+    printf("FOOBAR3 %d\n", MGraph.expired());
+
+
   if (!MGraph.expired()) {
-    throw sycl::exception(make_error_code(errc::invalid),
-                          "wait method cannot be used for an event associated "
-                          "with a command graph.");
+    auto GraphImpl = MGraph.lock();
+
+    // Add a host sync node to the graph to create a partition point.
+    // TODO: test if partitioned wait bits are set
+    if (GraphImpl) {
+      auto EmptyCG = std::make_shared<detail::CG>(
+          detail::CGType::None,
+          detail::CG::StorageInitHelper{}
+      );
+
+      std::vector<ext::oneapi::experimental::detail::node_impl *> EmptyDeps;
+      ext::oneapi::experimental::detail::node_impl &HostSyncNode = GraphImpl->add(
+          ext::oneapi::experimental::node_type::host_sync,
+          EmptyCG,
+          EmptyDeps
+      );
+
+      printf("FOOBAR2\n");
+
+    }
   }
 
 #ifdef XPTI_ENABLE_INSTRUMENTATION
diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp
index 9a36bbb0b9476..75d9095648e58 100644
--- a/sycl/source/detail/graph/graph_impl.cpp
+++ b/sycl/source/detail/graph/graph_impl.cpp
@@ -18,6 +18,7 @@
 #include <detail/kernel_impl.hpp>                     // for kernel_impl
 #include <detail/program_manager/program_manager.hpp> // ProgramManager
 #include <detail/queue_impl.hpp>                      // for queue_impl
+#include <iostream>                                   // for debug output
 #include <detail/sycl_mem_obj_t.hpp>                  // for SYCLMemObjT
 #include <stack>                                      // for stack
 #include <sycl/detail/common.hpp>      // for tls_code_loc_t etc..
@@ -68,6 +69,8 @@ inline const char *nodeTypeToString(node_type NodeType) {
     return "async_malloc";
   case node_type::async_free:
     return "async_free";
+    case node_type::host_sync:
+    return "host_sync";
   }
   assert(false && "Unhandled node type");
   return {};
@@ -140,19 +143,19 @@ void propagatePartitionUp(node_impl &Node, int PartitionNum) {
 /// remain.
 /// @param Node Node to assign to the partition.
 /// @param PartitionNum Number to propagate.
-/// @param HostTaskList List of host tasks that have already been processed and
+/// @param CutVertexList List of tasks that have already been processed and
 /// are encountered as successors to the node Node.
 void propagatePartitionDown(node_impl &Node, int PartitionNum,
-                            std::list<node_impl *> &HostTaskList) {
+                            std::list<node_impl *> &CutVertexList) {
   if (Node.MCGType == sycl::detail::CGType::CodeplayHostTask) {
     if (Node.MPartitionNum != -1) {
-      HostTaskList.push_front(&Node);
+      CutVertexList.push_front(&Node);
     }
     return;
   }
   Node.MPartitionNum = PartitionNum;
   for (node_impl &Successor : Node.successors()) {
-    propagatePartitionDown(Successor, PartitionNum, HostTaskList);
+    propagatePartitionDown(Successor, PartitionNum, CutVertexList);
   }
 }
 
@@ -180,24 +183,50 @@ void partition::updateSchedule() {
 
 void exec_graph_impl::makePartitions() {
   int CurrentPartition = -1;
-  std::list<node_impl *> HostTaskList;
+  std::list<node_impl *> CutVertexList;
+
+#define SYCL_GRAPH_DEBUG 1
+#ifdef SYCL_GRAPH_DEBUG
+  // Debug: Print total number of nodes
+  std::cout << "[DEBUG] makePartitions: Starting with " << MNodeStorage.size() << " nodes" << std::endl;
+  
+  // Debug: Print all nodes and their types
+  int nodeIndex = 0;
+  for (node_impl &Node : nodes()) {
+    std::cout << "[DEBUG] Node " << nodeIndex << ": Type=" << nodeTypeToString(Node.MNodeType) 
+              << ", CGType=" << static_cast<int>(Node.MCGType) << std::endl;
+    nodeIndex++;
+  }
+#endif
+
+  // A cut vertex is a node that, when removed, increases the number of connected components
+  // in the graph. In our case, cut vertices are host-tasks / sync tasks that separate partitions
+  auto const IsCutVertex = [](node_impl const& node) {
+    return node.MCGType == sycl::detail::CGType::CodeplayHostTask || 
+           node.MNodeType == node_type::host_sync;
+  };
+  
   // find all the host-tasks in the graph
   for (node_impl &Node : nodes()) {
-    if (Node.MCGType == sycl::detail::CGType::CodeplayHostTask) {
-      HostTaskList.push_back(&Node);
+    if (IsCutVertex(Node)) {
+      CutVertexList.push_back(&Node);
     }
   }
 
-  MContainsHostTask = HostTaskList.size() > 0;
+  MContainsHostTask = CutVertexList.size() > 0;
+#ifdef SYCL_GRAPH_DEBUG
+  std::cout << "[DEBUG] Found " << CutVertexList.size() << " host tasks, MContainsHostTask=" 
+            << (MContainsHostTask ? "true" : "false") << std::endl;
+#endif
   // Annotate nodes
   // The first step in graph partitioning is to annotate all nodes of the graph
   // with a temporary partition or group number. This step allows us to group
   // the graph nodes into sets of nodes with kind of meta-dependencies that must
   // be enforced by the runtime. For example, Group 2 depends on Groups 0 and 1,
   // which means that we should not try to run Group 2 before Groups 0 and 1
-  // have finished executing. Since host-tasks are currently the only tasks that
+  // have finished executing. Since host-tasks and sync-tasks are the only tasks that
   // require runtime dependency handling, groups of nodes are created from
-  // host-task nodes. We therefore loop over all the host-task nodes, and for
+  // these nodes. We therefore loop over all the host-task and sync-task nodes, and for
   // each node:
   //  - Its predecessors are assigned to group number `n-1`
   //  - The node itself constitutes a group, group number `n`
@@ -213,23 +242,32 @@ void exec_graph_impl::makePartitions() {
   // case, the host-task node `A` must be reprocessed after the node `B` and the
   // group that includes the predecessor of `B` can be merged with the group of
   // the predecessors of the node `A`.
-  while (HostTaskList.size() > 0) {
-    node_impl &Node = *HostTaskList.front();
-    HostTaskList.pop_front();
+  while (CutVertexList.size() > 0) {
+    node_impl &Node = *CutVertexList.front();
+    CutVertexList.pop_front();
+#ifdef SYCL_GRAPH_DEBUG
+    std::cout << "[DEBUG] Processing host task node, CurrentPartition=" << CurrentPartition << std::endl;
+#endif
     CurrentPartition++;
     for (node_impl &Predecessor : Node.predecessors()) {
       propagatePartitionUp(Predecessor, CurrentPartition);
     }
     CurrentPartition++;
     Node.MPartitionNum = CurrentPartition;
+#ifdef SYCL_GRAPH_DEBUG
+    std::cout << "[DEBUG] Assigned host task to partition " << CurrentPartition << std::endl;
+#endif
     CurrentPartition++;
-    auto TmpSize = HostTaskList.size();
+    auto TmpSize = CutVertexList.size();
     for (node_impl &Successor : Node.successors()) {
-      propagatePartitionDown(Successor, CurrentPartition, HostTaskList);
+      propagatePartitionDown(Successor, CurrentPartition, CutVertexList);
     }
-    if (HostTaskList.size() > TmpSize) {
+    if (CutVertexList.size() > TmpSize) {
+#ifdef SYCL_GRAPH_DEBUG
+      std::cout << "[DEBUG] Host task list size increased, merging partitions" << std::endl;
+#endif
       // At least one HostTask has been re-numbered so group merge opportunities
-      for (node_impl *HT : HostTaskList) {
+      for (node_impl *HT : CutVertexList) {
         auto HTPartitionNum = HT->MPartitionNum;
         if (HTPartitionNum != -1) {
           // can merge predecessors of node `Node` with predecessors of node
@@ -246,14 +284,32 @@ void exec_graph_impl::makePartitions() {
     }
   }
 
+#ifdef SYCL_GRAPH_DEBUG
+  // Debug: Print node partition assignments before creating partitions
+  std::cout << "[DEBUG] Node partition assignments:" << std::endl;
+  nodeIndex = 0;
+  for (node_impl &Node : nodes()) {
+    std::cout << "[DEBUG] Node " << nodeIndex << ": Partition=" << Node.MPartitionNum 
+              << ", Type=" << nodeTypeToString(Node.MNodeType) << std::endl;
+    nodeIndex++;
+  }
+#endif
+
   // Create partitions
   int PartitionFinalNum = 0;
+#ifdef SYCL_GRAPH_DEBUG
+  std::cout << "[DEBUG] Creating partitions from " << -1 << " to " << CurrentPartition << std::endl;
+#endif
   for (int i = -1; i <= CurrentPartition; i++) {
     const std::shared_ptr<partition> &Partition = std::make_shared<partition>();
+    int nodesInPartition = 0;
+    int rootsInPartition = 0;
     for (node_impl &Node : nodes()) {
       if (Node.MPartitionNum == i) {
+        nodesInPartition++;
         MPartitionNodes[&Node] = PartitionFinalNum;
         if (isPartitionRoot(Node)) {
+          rootsInPartition++;
           Partition->MRoots.insert(&Node);
           if (Node.MCGType == CGType::CodeplayHostTask) {
             Partition->MIsHostTask = true;
@@ -261,21 +317,37 @@ void exec_graph_impl::makePartitions() {
         }
       }
     }
+#ifdef SYCL_GRAPH_DEBUG
+    std::cout << "[DEBUG] Partition " << i << ": " << nodesInPartition << " nodes, " 
+              << rootsInPartition << " roots" << std::endl;
+#endif
     if (Partition->MRoots.size() > 0) {
       Partition->updateSchedule();
       Partition->MIsInOrderGraph = Partition->checkIfGraphIsSinglePath();
       MPartitions.push_back(Partition);
       MRootPartitions.push_back(Partition);
+#ifdef SYCL_GRAPH_DEBUG
+      std::cout << "[DEBUG] Added partition " << PartitionFinalNum << " (original " << i 
+                << "), IsHostTask=" << (Partition->MIsHostTask ? "true" : "false") 
+                << ", IsInOrder=" << (Partition->MIsInOrderGraph ? "true" : "false") << std::endl;
+#endif
       PartitionFinalNum++;
     }
   }
 
   // Add an empty partition if there is no partition, i.e. empty graph
   if (MPartitions.empty()) {
+#ifdef SYCL_GRAPH_DEBUG
+    std::cout << "[DEBUG] No partitions created, adding empty partition" << std::endl;
+#endif
     MPartitions.push_back(std::make_shared<partition>());
     MRootPartitions.push_back(MPartitions[0]);
   }
 
+#ifdef SYCL_GRAPH_DEBUG
+  std::cout << "[DEBUG] Final result: " << MPartitions.size() << " partitions created" << std::endl;
+#endif
+
   // Make global schedule list
   for (const auto &Partition : MPartitions) {
     MSchedule.insert(MSchedule.end(), Partition->MSchedule.begin(),
@@ -283,22 +355,39 @@ void exec_graph_impl::makePartitions() {
   }
 
   // Compute partition dependencies
+  int partitionIdx = 0;
   for (const auto &Partition : MPartitions) {
+    int predecessorCount = 0;
+    int successorCount = 0;
     for (node_impl &Root : Partition->roots()) {
       for (node_impl &NodeDep : Root.predecessors()) {
         auto &Predecessor = MPartitions[MPartitionNodes[&NodeDep]];
         Partition->MPredecessors.push_back(Predecessor.get());
         Predecessor->MSuccessors.push_back(Partition.get());
+        predecessorCount++;
       }
     }
+    for (auto &Succ : Partition->MSuccessors) {
+      successorCount++;
+    }
+#ifdef SYCL_GRAPH_DEBUG
+    std::cout << "[DEBUG] Partition " << partitionIdx << " dependencies: " 
+              << predecessorCount << " predecessors, " << successorCount << " successors" << std::endl;
+#endif
+    partitionIdx++;
   }
 
   // Reset node groups (if node have to be re-processed - e.g. subgraph)
   for (node_impl &Node : nodes()) {
     Node.MPartitionNum = -1;
   }
+  
+#ifdef SYCL_GRAPH_DEBUG
+  std::cout << "[DEBUG] makePartitions completed" << std::endl;
+#endif
 }
 
+
 graph_impl::graph_impl(const sycl::context &SyclContext,
                        const sycl::device &SyclDevice,
                        const sycl::property_list &PropList)
@@ -685,6 +774,7 @@ std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents(
 
 void graph_impl::beginRecording(sycl::detail::queue_impl &Queue) {
   graph_impl::WriteLock Lock(MMutex);
+  printf("Graph %p beginRecording on Queue %p\n", this, &Queue);
   if (!Queue.hasCommandGraph()) {
     Queue.setCommandGraph(shared_from_this());
     addQueue(Queue);
@@ -924,9 +1014,9 @@ exec_graph_impl::~exec_graph_impl() {
 
 // Clean up any execution events which have finished so we don't pass them
 // to the scheduler.
-static void cleanupExecutionEvents(std::vector<EventImplPtr> &ExecutionEvents) {
+static void cleanupExecutionEvents(std::vector<sycl::detail::EventImplPtr> &ExecutionEvents) {
 
-  auto Predicate = [](EventImplPtr &EventPtr) {
+  auto Predicate = [](sycl::detail::EventImplPtr &EventPtr) {
     return EventPtr->isCompleted();
   };
 
@@ -935,7 +1025,7 @@ static void cleanupExecutionEvents(std::vector<EventImplPtr> &ExecutionEvents) {
       ExecutionEvents.end());
 }
 
-EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
+sycl::detail::EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
 
@@ -973,7 +1063,7 @@ EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
   return nullptr;
 }
 
-EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
+sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
 
@@ -1003,7 +1093,7 @@ EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
   return nullptr;
 }
 
-EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
+sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     std::vector<detail::EventImplPtr> &WaitEvents, bool EventNeeded) {
 
@@ -1043,7 +1133,7 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
   }
 }
 
-EventImplPtr
+sycl::detail::EventImplPtr
 exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
                                    sycl::detail::CG::StorageInitHelper &CGData,
                                    bool IsCGDataSafeForSchedulerBypass,
@@ -1157,7 +1247,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
   return SignalEvent;
 }
 
-EventImplPtr
+sycl::detail::EventImplPtr
 exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
                          sycl::detail::CG::StorageInitHelper CGData,
                          bool EventNeeded) {
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 8197ef477c744..751df62e35a2a 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -589,47 +589,29 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   auto StreamID = detail::getActiveXPTIStreamID();
   TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
 #endif
+    printf("FOOBAR4 %d\n", MGraph.expired());
+
 
   if (!MGraph.expired()) {
     auto GraphImpl = MGraph.lock();
 
-    // Add a barrier node to the graph to create a partition point.
     // TODO: test if partitioned wait bits are set
     if (GraphImpl) {
-//#define PARTITIONED_WAIT_IMPL_BARRIERS 1
-#define PARTITIONED_WAIT_IMPL_EMPTY_NODE 1
-#if PARTITIONED_WAIT_IMPL_BARRIERS
-      std::vector<detail::EventImplPtr> EmptyWaitList;
-      auto BarrierCG = std::make_shared<detail::CGBarrier>(
-          std::move(EmptyWaitList),
-          ext::oneapi::experimental::event_mode_enum::none,
-          detail::CG::StorageInitHelper{},
-          detail::CGType::BarrierWaitlist,
-          CodeLoc
-      );
-
-      std::vector<ext::oneapi::experimental::detail::node_impl *> EmptyDeps;
-      ext::oneapi::experimental::detail::node_impl &BarrierNode = GraphImpl->add(
-          ext::oneapi::experimental::node_type::ext_oneapi_barrier,
-          std::static_pointer_cast<detail::CG>(BarrierCG),
-          EmptyDeps
-      );
 
-      GraphImpl->setBarrierDep(shared_from_this(), BarrierNode);
-#elif PARTITIONED_WAIT_IMPL_EMPTY_NODE
       auto EmptyCG = std::make_shared<detail::CG>(
           detail::CGType::None,
           detail::CG::StorageInitHelper{},
           CodeLoc
       );
 
+      printf("FOOBAR1\n");
+
       std::vector<ext::oneapi::experimental::detail::node_impl *> EmptyDeps;
-      ext::oneapi::experimental::detail::node_impl &EmptyNode = GraphImpl->add(
-          ext::oneapi::experimental::node_type::empty,
+      GraphImpl->add(
+          ext::oneapi::experimental::node_type::host_sync,
           EmptyCG,
           EmptyDeps
       );
-#endif
     }
     return;
   }

From f97b9c06e275860fc3f2f442eb21e2f88ecb459b Mon Sep 17 00:00:00 2001
From: Adam Fidel <adam.fidel@intel.com>
Date: Tue, 7 Oct 2025 08:52:56 -0700
Subject: [PATCH 6/6] Remove unneeded changes

---
 sycl/source/detail/graph/graph_impl.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp
index 75d9095648e58..fa1cf1a4800d2 100644
--- a/sycl/source/detail/graph/graph_impl.cpp
+++ b/sycl/source/detail/graph/graph_impl.cpp
@@ -1014,9 +1014,9 @@ exec_graph_impl::~exec_graph_impl() {
 
 // Clean up any execution events which have finished so we don't pass them
 // to the scheduler.
-static void cleanupExecutionEvents(std::vector<sycl::detail::EventImplPtr> &ExecutionEvents) {
+static void cleanupExecutionEvents(std::vector<EventImplPtr> &ExecutionEvents) {
 
-  auto Predicate = [](sycl::detail::EventImplPtr &EventPtr) {
+  auto Predicate = [](EventImplPtr &EventPtr) {
     return EventPtr->isCompleted();
   };
 
@@ -1025,7 +1025,7 @@ static void cleanupExecutionEvents(std::vector<sycl::detail::EventImplPtr> &Exec
       ExecutionEvents.end());
 }
 
-sycl::detail::EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
+EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
 
@@ -1063,7 +1063,7 @@ sycl::detail::EventImplPtr exec_graph_impl::enqueueHostTaskPartition(
   return nullptr;
 }
 
-sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
+EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) {
 
@@ -1093,7 +1093,7 @@ sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
   return nullptr;
 }
 
-sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
+EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     std::vector<detail::EventImplPtr> &WaitEvents, bool EventNeeded) {
 
@@ -1133,7 +1133,7 @@ sycl::detail::EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
   }
 }
 
-sycl::detail::EventImplPtr
+EventImplPtr
 exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
                                    sycl::detail::CG::StorageInitHelper &CGData,
                                    bool IsCGDataSafeForSchedulerBypass,
@@ -1247,7 +1247,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
   return SignalEvent;
 }
 
-sycl::detail::EventImplPtr
+EventImplPtr
 exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
                          sycl::detail::CG::StorageInitHelper CGData,
                          bool EventNeeded) {