From 8e70e5c1aba1e24fbe3abdd89a872e70077fafbd Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 21 May 2025 11:50:56 +0100 Subject: [PATCH 1/5] Modified the adapters such that it is valid to call release on CB while it is executing --- sycl/source/detail/graph_impl.cpp | 5 --- .../scripts/core/EXP-COMMAND-BUFFER.rst | 1 + .../scripts/core/exp-command-buffer.yml | 2 +- .../source/adapters/cuda/command_buffer.cpp | 4 ++ .../source/adapters/cuda/command_buffer.hpp | 2 + .../source/adapters/hip/command_buffer.cpp | 4 ++ .../source/adapters/hip/command_buffer.hpp | 2 + .../adapters/level_zero/command_buffer.cpp | 39 ++++++++++--------- .../source/adapters/opencl/command_buffer.cpp | 4 ++ .../exp_command_buffer/enqueue.cpp | 10 +++++ 10 files changed, 48 insertions(+), 25 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4dc6bb7c13655..0ada5a56ba3a7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -977,11 +977,6 @@ exec_graph_impl::~exec_graph_impl() { const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); diff --git a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst index 136c7024632ec..2b443c13f6a3d 100644 --- a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst @@ -563,3 +563,4 @@ Contributors * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ * Fábio Mestre `fabio.mestre@codeplay.com `_ +* Konrad Kusiak `konrad.kusiak@codeplay.com `_ diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 05e61985cd4b8..d38c05894d13f 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It will try synchronizing the command-buffer, hence it is legal to call it while command-buffer is still executing." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index db64fec7c53c6..f861e39591832 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -388,6 +388,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } // Ref count has reached zero, release of created commands for (auto &Command : hCommandBuffer->CommandHandles) { commandHandleDestroy(Command); @@ -1175,6 +1178,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); + hCommandBuffer->CurrentExecution = *phEvent; } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index e11b9ab74969a..b7a3509ea72da 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,6 +192,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // The event of current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last // node added to an in-order command-buffer. diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index 788bc51b2e468..6c18295abf063 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -273,6 +273,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } delete hCommandBuffer; } return UR_RESULT_SUCCESS; @@ -811,6 +814,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); + hCommandBuffer->CurrentExecution = *phEvent; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3d0047adee013..7b6da91feaafe 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,6 +128,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // The event of current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events std::map SyncPoints; diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 5cb85d2059e65..1924a7c2aae34 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -671,6 +671,25 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -830,6 +849,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + UR_CALL(waitForOngoingExecution(CommandBuffer)); CommandBuffer->cleanupCommandBufferResources(); delete CommandBuffer; return UR_RESULT_SUCCESS; @@ -1442,25 +1462,6 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - - return UR_RESULT_SUCCESS; -} - /** * Waits for the all the dependencies of the command-buffer * @param[in] CommandBuffer The command-buffer. diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index e048b2d22175c..affc5b5882870 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,6 +115,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->LastSubmission) { + cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); + CL_RETURN_ON_FAILURE(RetErr); + } delete hCommandBuffer; } diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index eedd95d5c040b..b917361324174 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -132,3 +132,13 @@ TEST_P(urEnqueueCommandBufferExpTest, SerializeOutofOrderQueue) { ASSERT_EQ(reference, Output[i]); } } + +// Tests releasing command-buffer while it is still executing relying +// on synchronization during urCommandBufferReleaseExp call. +TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { + ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, + 0, nullptr, nullptr)); + + // Release the command buffer + ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); +} From 185588691d2828716a132c5b93acc6928a850353 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 26 May 2025 13:48:03 +0100 Subject: [PATCH 2/5] Modified spec wording and made creating an event branchless on CUDA and HIP --- unified-runtime/include/ur_api.h | 5 ++++- .../scripts/core/exp-command-buffer.yml | 2 +- .../source/adapters/cuda/command_buffer.cpp | 16 +++++++--------- .../source/adapters/cuda/command_buffer.hpp | 2 +- .../source/adapters/hip/command_buffer.cpp | 16 +++++++--------- .../source/adapters/hip/command_buffer.hpp | 2 +- unified-runtime/source/loader/ur_libapi.cpp | 5 ++++- unified-runtime/source/ur_api.cpp | 5 ++++- 8 files changed, 29 insertions(+), 24 deletions(-) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 8f840ccca112d..ae14fafeeff32 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -10952,7 +10952,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index d38c05894d13f..85e584dc4839f 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It will try synchronizing the command-buffer, hence it is legal to call it while command-buffer is still executing." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while hCommandBuffer is still executing, which will block on completion if the reference count of hCommandBuffer becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index f861e39591832..57dde21d20353 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -1165,20 +1165,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList)); - if (phEvent) { - RetImplEvent = std::unique_ptr( - ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, - hQueue, CuStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); // Launch graph UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - hCommandBuffer->CurrentExecution = *phEvent; + *phEvent = hCommandBuffer->CurrentExecution; } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index b7a3509ea72da..d58cd4f87cda8 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,7 +192,7 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // The event of current graph execution. + // Track the event of the current graph execution. ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index 6c18295abf063..b169a8ff4e5b4 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -801,20 +801,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); - if (phEvent) { - RetImplEvent = std::unique_ptr( - ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, - hQueue, HIPStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + hQueue, HIPStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); // Launch graph UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); - hCommandBuffer->CurrentExecution = *phEvent; + *phEvent = hCommandBuffer->CurrentExecution; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 7b6da91feaafe..3dd1417fef427 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,7 +128,7 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; - // The event of current graph execution. + // Track the event of the current graph execution. ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index a7118181a5ea8..1a0ffcb2eb407 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8448,7 +8448,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index b5f2d422eb15a..a93dc02dd6206 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7357,7 +7357,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while hCommandBuffer is still +/// executing, which will block on completion if the reference count of +/// hCommandBuffer becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS From 44010d0c67bdd0b5b1c046483734aa2233233528 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Mon, 26 May 2025 15:34:20 +0100 Subject: [PATCH 3/5] Added backticks --- unified-runtime/include/ur_api.h | 4 ++-- unified-runtime/scripts/core/exp-command-buffer.yml | 2 +- unified-runtime/source/loader/ur_libapi.cpp | 4 ++-- unified-runtime/source/ur_api.cpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 44982f1117c41..d79af30a65daa 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -10954,9 +10954,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 85e584dc4839f..e8f2caa15d59d 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while hCommandBuffer is still executing, which will block on completion if the reference count of hCommandBuffer becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while `hCommandBuffer` is still executing, which will block on completion if the reference count of `hCommandBuffer` becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index d2ed98e1d0275..679f7c3a4677b 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8450,9 +8450,9 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index fc899ecadd538..5b404fc5d88f4 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7359,9 +7359,9 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the /// command-buffer object if the reference count becomes zero. It is -/// legal to call the entry-point while hCommandBuffer is still +/// legal to call the entry-point while `hCommandBuffer` is still /// executing, which will block on completion if the reference count of -/// hCommandBuffer becomes zero. +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS From b8d166627313970bb754d58756ba03bdc71222ec Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Tue, 27 May 2025 16:31:54 +0100 Subject: [PATCH 4/5] Changed to EXPECT_SUCCESS and added queue wait before exiting --- .../test/conformance/exp_command_buffer/enqueue.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index ccd59dacd8ad9..91ce1381871db 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -142,6 +142,9 @@ TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, 0, nullptr, nullptr)); - // Release the command buffer - ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + // Release the command buffer without explicitly waiting beforehand + EXPECT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); + + // Wait before exiting + ASSERT_SUCCESS(urQueueFinish(out_of_order_queue)); } From f2e1f92d88269de1626009d4b80e57ce194750dd Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 28 May 2025 23:42:57 +0100 Subject: [PATCH 5/5] Brought back additional fence for wait event path --- .../adapters/level_zero/command_buffer.cpp | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 1924a7c2aae34..25c2d8db2bcff 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -677,7 +677,7 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, * @return UR_RESULT_SUCCESS or an error code on failure */ ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { +waitForLastSubmission(ur_exp_command_buffer_handle_t CommandBuffer) { if (ur_event_handle_t &CurrentSubmissionEvent = CommandBuffer->CurrentSubmissionEvent) { @@ -690,6 +690,30 @@ waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish + * but put fence in case of wait event path. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (CommandBuffer->UseImmediateAppendPath) { + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + } else if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) { + ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -1732,7 +1756,7 @@ ur_result_t urEnqueueCommandBufferExp( std::scoped_lock Lock(UrQueue->Mutex); - UR_CALL(waitForOngoingExecution(CommandBuffer)); + UR_CALL(waitForLastSubmission(CommandBuffer)); const bool IsInternal = (Event == nullptr); const bool DoProfiling =