diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 1308be6c93a..64d940d44fb 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -38,7 +38,8 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config)
       querypool_(config_.query_pool_config, nullptr),
       // Command buffer submission
       cmd_mutex_{},
-      cmd_(VK_NULL_HANDLE, 0u),
+      cmd_(VK_NULL_HANDLE, VK_NULL_HANDLE, 0u),
+      prev_semaphore_(VK_NULL_HANDLE),
       submit_count_{0u},
       // Memory Management
       buffer_clearlist_mutex_{},
@@ -195,10 +196,21 @@ void Context::register_blit(
 }
 
 void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
+  // Wait semaphore would be previous command buffer's signal semaphore
+  VkSemaphore wait_semaphore = prev_semaphore_;
+  // Signal semaphore for the current command buffer
+  VkSemaphore signal_semaphore = cmd_.get_signal_semaphore();
+  // Next command buffer would wait on this command buffer's signal semaphore
+  prev_semaphore_ = signal_semaphore;
+
   if (cmd_) {
     cmd_.end();
     adapter_p_->submit_cmd(
-        queue_, cmd_.get_submit_handle(final_use), fence_handle);
+        queue_,
+        cmd_.get_submit_handle(final_use),
+        fence_handle,
+        wait_semaphore,
+        signal_semaphore);
 
     submit_count_ = 0u;
   }
@@ -214,6 +226,8 @@ void Context::flush() {
   if (cmd_) {
     cmd_.invalidate();
   }
+  // Reset previous command buffer semaphore
+  prev_semaphore_ = VK_NULL_HANDLE;
 
   std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
   std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index e55ddcca141..9d8e7c92255 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -68,6 +68,8 @@ class Context final {
   // Command buffers submission
   std::mutex cmd_mutex_;
   vkapi::CommandBuffer cmd_;
+  // Semaphore for the previously submitted command buffer, if any
+  VkSemaphore prev_semaphore_;
   uint32_t submit_count_;
   // Memory Management
   std::mutex buffer_clearlist_mutex_;
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index e576dfae394..2a0b0d7db93 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -765,8 +765,6 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
   context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
   fence.wait();
   context_->fences().return_fence(fence);
-
-  context_->flush();
 }
 
 void ComputeGraph::prepack() {
@@ -787,7 +785,12 @@ void ComputeGraph::prepack() {
       // Otherwise, just submit the current command buffer for execution and
       // proceed. This results in lower load latency at the cost of higher peak
       // memory usage.
-      reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd();
+      if (reduce_peak_memory) {
+        submit_current_cmd_and_wait();
+        context_->flush();
+      } else {
+        submit_current_cmd();
+      }
       staging_nbytes_in_cmd_ = 0;
       context_->set_cmd();
       submitted = true;
@@ -797,6 +800,7 @@ void ComputeGraph::prepack() {
     i++;
   }
   submit_current_cmd_and_wait(/*final_use=*/true);
+  context_->flush();
   staging_nbytes_in_cmd_ = 0;
 }
 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 23cac658d50..1961f5046e2 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -847,8 +847,7 @@ class ComputeGraph final {
 
   /*
    * Submits the current command buffer in the Context to the GPU for execution,
-   * and wait for it to complete before returning. This function will also flush
-   * the Context after execution.
+   * and wait for it to complete before returning.
    */
  void submit_current_cmd_and_wait(const bool final_use = false);
 
diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp
index 038a66159fb..e08491c656b 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.cpp
+++ b/backends/vulkan/runtime/vk_api/Adapter.cpp
@@ -307,17 +307,22 @@ void Adapter::return_queue(Adapter::Queue& compute_queue) {
 void Adapter::submit_cmd(
     const Adapter::Queue& device_queue,
     VkCommandBuffer cmd,
-    VkFence fence) {
+    VkFence fence,
+    VkSemaphore wait_semaphore,
+    VkSemaphore signal_semaphore) {
+  const VkPipelineStageFlags flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+  const bool set_wait_semaphore = wait_semaphore != VK_NULL_HANDLE;
+  const bool set_signal_semaphore = signal_semaphore != VK_NULL_HANDLE;
   const VkSubmitInfo submit_info{
       VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
       nullptr, // pNext
-      0u, // waitSemaphoreCount
-      nullptr, // pWaitSemaphores
-      nullptr, // pWaitDstStageMask
+      set_wait_semaphore ? 1u : 0u, // waitSemaphoreCount
+      set_wait_semaphore ? &wait_semaphore : nullptr, // pWaitSemaphores
+      &flags, // pWaitDstStageMask
       1u, // commandBufferCount
       &cmd, // pCommandBuffers
-      0u, // signalSemaphoreCount
-      nullptr, // pSignalSemaphores
+      set_signal_semaphore ? 1u : 0u, // signalSemaphoreCount
+      set_signal_semaphore ? &signal_semaphore : nullptr, // pSignalSemaphores
   };
 
   std::lock_guard<std::mutex> queue_lock(
diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h
index d242e2d3ac1..aa4c659c6d8 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.h
+++ b/backends/vulkan/runtime/vk_api/Adapter.h
@@ -242,8 +242,12 @@ class Adapter final {
 
   // Command Buffer Submission
 
-  void
-  submit_cmd(const Queue&, VkCommandBuffer, VkFence fence = VK_NULL_HANDLE);
+  void submit_cmd(
+      const Queue&,
+      VkCommandBuffer,
+      VkFence fence = VK_NULL_HANDLE,
+      VkSemaphore wait_semaphore = VK_NULL_HANDLE,
+      VkSemaphore signal_semaphore = VK_NULL_HANDLE);
 
   std::string stringize() const;
   friend std::ostream& operator<<(std::ostream&, const Adapter&);
diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp
index 3a5041f9500..4e0a915fe98 100644
--- a/backends/vulkan/runtime/vk_api/Command.cpp
+++ b/backends/vulkan/runtime/vk_api/Command.cpp
@@ -20,28 +20,34 @@ namespace vkapi {
 
 CommandBuffer::CommandBuffer(
     VkCommandBuffer handle,
+    VkSemaphore semaphore,
     const VkCommandBufferUsageFlags flags)
     : handle_(handle),
+      signal_semaphore_(semaphore),
       flags_(flags),
       state_(CommandBuffer::State::NEW),
       bound_{} {}
 
 CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept
     : handle_(other.handle_),
+      signal_semaphore_(other.signal_semaphore_),
       flags_(other.flags_),
-      state_(CommandBuffer::State::INVALID),
+      state_(other.state_),
       bound_(other.bound_) {
   other.handle_ = VK_NULL_HANDLE;
+  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
 }
 
 CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept {
   handle_ = other.handle_;
+  signal_semaphore_ = other.signal_semaphore_;
   flags_ = other.flags_;
   state_ = other.state_;
   bound_ = other.bound_;
 
   other.handle_ = VK_NULL_HANDLE;
+  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
   other.state_ = CommandBuffer::State::INVALID;
 
@@ -304,6 +310,12 @@ CommandPool::~CommandPool() {
   if (pool_ == VK_NULL_HANDLE) {
     return;
   }
+  for (auto& semaphore : semaphores_) {
+    if (semaphore != VK_NULL_HANDLE) {
+      vkDestroySemaphore(device_, semaphore, nullptr);
+    }
+  }
+
   vkDestroyCommandPool(device_, pool_, nullptr);
 }
 
@@ -314,6 +326,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   allocate_new_batch(config_.cmd_pool_batch_size);
 
   VkCommandBuffer handle = buffers_[in_use_];
+  VkSemaphore semaphore = semaphores_[in_use_];
 
   VkCommandBufferUsageFlags cmd_flags = 0u;
   if (!reusable) {
@@ -321,7 +334,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   }
 
   in_use_++;
-  return CommandBuffer(handle, cmd_flags);
+  return CommandBuffer(handle, semaphore, cmd_flags);
 }
 
 void CommandPool::flush() {
@@ -337,6 +350,7 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
   }
 
   buffers_.resize(buffers_.size() + count);
+  semaphores_.resize(buffers_.size() + count);
 
   const VkCommandBufferAllocateInfo allocate_info{
       VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType
@@ -348,6 +362,17 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
 
   VK_CHECK(vkAllocateCommandBuffers(
       device_, &allocate_info, buffers_.data() + in_use_));
+
+  const VkSemaphoreCreateInfo semaphoreCreateInfo = {
+      VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0};
+
+  for (uint32_t i = 0; i < count; i++) {
+    VK_CHECK(vkCreateSemaphore(
+        device_,
+        &semaphoreCreateInfo,
+        nullptr,
+        semaphores_.data() + in_use_ + i));
+  }
 }
 
 } // namespace vkapi
diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h
index ff1e5934a5c..d6d3fe05a34 100644
--- a/backends/vulkan/runtime/vk_api/Command.h
+++ b/backends/vulkan/runtime/vk_api/Command.h
@@ -26,7 +26,10 @@ namespace vkapi {
 
 class CommandBuffer final {
  public:
-  explicit CommandBuffer(VkCommandBuffer, const VkCommandBufferUsageFlags);
+  explicit CommandBuffer(
+      VkCommandBuffer,
+      VkSemaphore,
+      const VkCommandBufferUsageFlags);
 
   CommandBuffer(const CommandBuffer&) = delete;
   CommandBuffer& operator=(const CommandBuffer&) = delete;
@@ -70,6 +73,8 @@ class CommandBuffer final {
 
  private:
   VkCommandBuffer handle_;
+  // Semaphore to signal when the command buffer has completed execution
+  VkSemaphore signal_semaphore_;
   VkCommandBufferUsageFlags flags_;
   State state_;
   Bound bound_;
@@ -81,6 +86,7 @@ class CommandBuffer final {
 
   inline void invalidate() {
     handle_ = VK_NULL_HANDLE;
+    signal_semaphore_ = VK_NULL_HANDLE;
     bound_.reset();
   }
 
@@ -100,6 +106,10 @@ class CommandBuffer final {
 
   VkCommandBuffer get_submit_handle(const bool final_use = false);
 
+  VkSemaphore get_signal_semaphore() const {
+    return signal_semaphore_;
+  }
+
   inline operator bool() const {
     return handle_ != VK_NULL_HANDLE;
   }
@@ -130,6 +140,8 @@ class CommandPool final {
   // New Buffers
   std::mutex mutex_;
   std::vector<VkCommandBuffer> buffers_;
+  // Semaphores corresponding to the command buffers
+  std::vector<VkSemaphore> semaphores_;
   size_t in_use_;
 
  public:
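
Note (not part of the patch): the pattern introduced above is per-command-buffer signal semaphores chained across submissions, so that each submitted batch waits, at the compute-shader stage, on the semaphore signaled by the previously submitted batch. The sketch below illustrates that chaining in isolation with raw Vulkan calls under stated assumptions; it is not the ExecuTorch API. The names chain_submit and check are hypothetical helpers, and the VkDevice, VkQueue, and pre-recorded command buffers are assumed to be created elsewhere.

// Illustrative sketch only: chain a sequence of pre-recorded command buffers
// so each submission waits on the previous one's signal semaphore.
#include <vulkan/vulkan.h>

#include <cstdlib>
#include <vector>

inline void check(VkResult result) {
  // Minimal error handling for the sketch; a real implementation would
  // propagate or log the failing VkResult.
  if (result != VK_SUCCESS) {
    std::abort();
  }
}

void chain_submit(
    VkDevice device,
    VkQueue queue,
    const std::vector<VkCommandBuffer>& cmds) {
  // One signal semaphore per command buffer, mirroring the CommandPool change.
  std::vector<VkSemaphore> semaphores(cmds.size(), VK_NULL_HANDLE);
  const VkSemaphoreCreateInfo semaphore_info{
      VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0};
  for (VkSemaphore& semaphore : semaphores) {
    check(vkCreateSemaphore(device, &semaphore_info, nullptr, &semaphore));
  }

  // prev plays the role of prev_semaphore_ in Context: each submission waits
  // on the semaphore signaled by the one before it.
  VkSemaphore prev = VK_NULL_HANDLE;
  const VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

  for (size_t i = 0; i < cmds.size(); ++i) {
    const bool has_wait = prev != VK_NULL_HANDLE;
    const VkSubmitInfo submit_info{
        VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
        nullptr, // pNext
        has_wait ? 1u : 0u, // waitSemaphoreCount
        has_wait ? &prev : nullptr, // pWaitSemaphores
        &wait_stage, // pWaitDstStageMask
        1u, // commandBufferCount
        &cmds[i], // pCommandBuffers
        1u, // signalSemaphoreCount
        &semaphores[i], // pSignalSemaphores
    };
    check(vkQueueSubmit(queue, 1u, &submit_info, VK_NULL_HANDLE));
    // The next submission will wait on this submission's signal semaphore.
    prev = semaphores[i];
  }

  // Drain the queue before destroying the semaphores.
  check(vkQueueWaitIdle(queue));
  for (VkSemaphore semaphore : semaphores) {
    vkDestroySemaphore(device, semaphore, nullptr);
  }
}

Because each submission only needs to order itself after the one immediately before it, remembering the single previously signaled semaphore is sufficient; no dependency graph is required. Waiting at VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT fits a compute-only workload such as this backend's; other workloads would typically choose a different wait stage.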