diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4dc6bb7c13655..0ada5a56ba3a7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -977,11 +977,6 @@ exec_graph_impl::~exec_graph_impl() { const sycl::detail::AdapterPtr &Adapter = sycl::detail::getSyclObjImpl(MContext)->getAdapter(); MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } // Clean up any graph-owned allocations that were allocated MGraphImpl->getMemPool().deallocateAndUnmapAll(); diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 6e12509db9cdf..d79af30a65daa 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -10953,7 +10953,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst index 136c7024632ec..2b443c13f6a3d 100644 --- a/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/unified-runtime/scripts/core/EXP-COMMAND-BUFFER.rst @@ -563,3 +563,4 @@ Contributors * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ * Fábio Mestre `fabio.mestre@codeplay.com `_ +* Konrad Kusiak `konrad.kusiak@codeplay.com `_ diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index 05e61985cd4b8..e8f2caa15d59d 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -310,7 +310,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero." +desc: "Decrement the command-buffer object's reference count and delete the command-buffer object if the reference count becomes zero. It is legal to call the entry-point while `hCommandBuffer` is still executing, which will block on completion if the reference count of `hCommandBuffer` becomes zero." class: $xCommandBuffer name: ReleaseExp params: diff --git a/unified-runtime/source/adapters/cuda/command_buffer.cpp b/unified-runtime/source/adapters/cuda/command_buffer.cpp index db64fec7c53c6..57dde21d20353 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.cpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.cpp @@ -388,6 +388,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } // Ref count has reached zero, release of created commands for (auto &Command : hCommandBuffer->CommandHandles) { commandHandleDestroy(Command); @@ -1162,19 +1165,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList)); - if (phEvent) { - RetImplEvent = std::unique_ptr( - ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, - hQueue, CuStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); // Launch graph UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); + *phEvent = hCommandBuffer->CurrentExecution; } return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { diff --git a/unified-runtime/source/adapters/cuda/command_buffer.hpp b/unified-runtime/source/adapters/cuda/command_buffer.hpp index e11b9ab74969a..d58cd4f87cda8 100644 --- a/unified-runtime/source/adapters/cuda/command_buffer.hpp +++ b/unified-runtime/source/adapters/cuda/command_buffer.hpp @@ -192,6 +192,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::cuda::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // Track the event of the current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events, so that we can find the last // node added to an in-order command-buffer. diff --git a/unified-runtime/source/adapters/hip/command_buffer.cpp b/unified-runtime/source/adapters/hip/command_buffer.cpp index 788bc51b2e468..b169a8ff4e5b4 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.cpp +++ b/unified-runtime/source/adapters/hip/command_buffer.cpp @@ -273,6 +273,9 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->CurrentExecution) { + UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait()); + } delete hCommandBuffer; } return UR_RESULT_SUCCESS; @@ -798,19 +801,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp( UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); - if (phEvent) { - RetImplEvent = std::unique_ptr( - ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, - hQueue, HIPStream, StreamToken)); - UR_CHECK_ERROR(RetImplEvent->start()); - } + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + hQueue, HIPStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); // Launch graph UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->record()); + hCommandBuffer->CurrentExecution = RetImplEvent.release(); if (phEvent) { - UR_CHECK_ERROR(RetImplEvent->record()); - *phEvent = RetImplEvent.release(); + *phEvent = hCommandBuffer->CurrentExecution; } } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/hip/command_buffer.hpp b/unified-runtime/source/adapters/hip/command_buffer.hpp index 3d0047adee013..3dd1417fef427 100644 --- a/unified-runtime/source/adapters/hip/command_buffer.hpp +++ b/unified-runtime/source/adapters/hip/command_buffer.hpp @@ -128,6 +128,8 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base { // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // Track the event of the current graph execution. + ur_event_handle_t CurrentExecution = nullptr; // Ordered map of sync_points to ur_events std::map SyncPoints; diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 5cb85d2059e65..1924a7c2aae34 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -671,6 +671,25 @@ ur_result_t createMainCommandList(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } +/** + * Waits for any ongoing executions of the command-buffer to finish. + * @param CommandBuffer The command-buffer to wait for. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { + + if (ur_event_handle_t &CurrentSubmissionEvent = + CommandBuffer->CurrentSubmissionEvent) { + ZE2UR_CALL(zeEventHostSynchronize, + (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); + UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); + CurrentSubmissionEvent = nullptr; + } + + return UR_RESULT_SUCCESS; +} + /** * Checks whether the command-buffer can be constructed using in order * command-lists. @@ -830,6 +849,7 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { if (!CommandBuffer->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; + UR_CALL(waitForOngoingExecution(CommandBuffer)); CommandBuffer->cleanupCommandBufferResources(); delete CommandBuffer; return UR_RESULT_SUCCESS; @@ -1442,25 +1462,6 @@ ur_result_t getZeCommandQueue(ur_queue_handle_t Queue, bool UseCopyEngine, return UR_RESULT_SUCCESS; } -/** - * Waits for any ongoing executions of the command-buffer to finish. - * @param CommandBuffer The command-buffer to wait for. - * @return UR_RESULT_SUCCESS or an error code on failure - */ -ur_result_t -waitForOngoingExecution(ur_exp_command_buffer_handle_t CommandBuffer) { - - if (ur_event_handle_t &CurrentSubmissionEvent = - CommandBuffer->CurrentSubmissionEvent) { - ZE2UR_CALL(zeEventHostSynchronize, - (CurrentSubmissionEvent->ZeEvent, UINT64_MAX)); - UR_CALL(urEventReleaseInternal(CurrentSubmissionEvent)); - CurrentSubmissionEvent = nullptr; - } - - return UR_RESULT_SUCCESS; -} - /** * Waits for the all the dependencies of the command-buffer * @param[in] CommandBuffer The command-buffer. diff --git a/unified-runtime/source/adapters/opencl/command_buffer.cpp b/unified-runtime/source/adapters/opencl/command_buffer.cpp index e048b2d22175c..affc5b5882870 100644 --- a/unified-runtime/source/adapters/opencl/command_buffer.cpp +++ b/unified-runtime/source/adapters/opencl/command_buffer.cpp @@ -115,6 +115,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { if (hCommandBuffer->decrementReferenceCount() == 0) { + if (hCommandBuffer->LastSubmission) { + cl_int RetErr = clWaitForEvents(1, &(hCommandBuffer->LastSubmission)); + CL_RETURN_ON_FAILURE(RetErr); + } delete hCommandBuffer; } diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 21ce6a6383d9e..679f7c3a4677b 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -8449,7 +8449,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index c2727d14cc166..5b404fc5d88f4 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -7358,7 +7358,10 @@ ur_result_t UR_APICALL urCommandBufferRetainExp( /////////////////////////////////////////////////////////////////////////////// /// @brief Decrement the command-buffer object's reference count and delete the -/// command-buffer object if the reference count becomes zero. +/// command-buffer object if the reference count becomes zero. It is +/// legal to call the entry-point while `hCommandBuffer` is still +/// executing, which will block on completion if the reference count of +/// `hCommandBuffer` becomes zero. /// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp index a2c140bde4aab..ccd59dacd8ad9 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/enqueue.cpp @@ -135,3 +135,13 @@ TEST_P(urEnqueueCommandBufferExpTest, SerializeOutofOrderQueue) { ASSERT_EQ(reference, Output[i]); } } + +// Tests releasing command-buffer while it is still executing relying +// on synchronization during urCommandBufferReleaseExp call. +TEST_P(urEnqueueCommandBufferExpTest, EnqueueAndRelease) { + ASSERT_SUCCESS(urEnqueueCommandBufferExp(out_of_order_queue, cmd_buf_handle, + 0, nullptr, nullptr)); + + // Release the command buffer + ASSERT_SUCCESS(urCommandBufferReleaseExp(cmd_buf_handle)); +}