diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 1308be6c93a..64d940d44fb 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -38,7 +38,8 @@ Context::Context(vkapi::Adapter* adapter, const ContextConfig& config)
       querypool_(config_.query_pool_config, nullptr),
       // Command buffer submission
       cmd_mutex_{},
-      cmd_(VK_NULL_HANDLE, 0u),
+      cmd_(VK_NULL_HANDLE, VK_NULL_HANDLE, 0u),
+      prev_semaphore_(VK_NULL_HANDLE),
       submit_count_{0u},
       // Memory Management
       buffer_clearlist_mutex_{},
@@ -195,10 +196,21 @@ void Context::register_blit(
 }
 
 void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
+  // Wait semaphore would be previous command buffer's signal semaphore
+  VkSemaphore wait_semaphore = prev_semaphore_;
+  // Signal semaphore for the current command buffer
+  VkSemaphore signal_semaphore = cmd_.get_signal_semaphore();
+  // Next command buffer would wait on this command buffer's signal semaphore
+  prev_semaphore_ = signal_semaphore;
+
   if (cmd_) {
     cmd_.end();
     adapter_p_->submit_cmd(
-        queue_, cmd_.get_submit_handle(final_use), fence_handle);
+        queue_,
+        cmd_.get_submit_handle(final_use),
+        fence_handle,
+        wait_semaphore,
+        signal_semaphore);
 
     submit_count_ = 0u;
   }
@@ -214,6 +226,8 @@ void Context::flush() {
   if (cmd_) {
     cmd_.invalidate();
   }
+  // Reset previous command buffer semaphore
+  prev_semaphore_ = VK_NULL_HANDLE;
 
   std::lock_guard<std::mutex> bufferlist_lock(buffer_clearlist_mutex_);
   std::lock_guard<std::mutex> imagelist_lock(image_clearlist_mutex_);
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index e55ddcca141..9d8e7c92255 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -68,6 +68,8 @@ class Context final {
   // Command buffers submission
   std::mutex cmd_mutex_;
   vkapi::CommandBuffer cmd_;
+  // Semaphore for the previously submitted command buffer, if any
+  VkSemaphore prev_semaphore_;
   uint32_t submit_count_;
   // Memory Management
   std::mutex buffer_clearlist_mutex_;
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index e576dfae394..2a0b0d7db93 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -765,8 +765,6 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
   context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
   fence.wait();
   context_->fences().return_fence(fence);
-
-  context_->flush();
 }
 
 void ComputeGraph::prepack() {
@@ -787,7 +785,12 @@ void ComputeGraph::prepack() {
       // Otherwise, just submit the current command buffer for execution and
       // proceed. This results in lower load latency at the cost of higher peak
       // memory usage.
-      reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd();
+      if (reduce_peak_memory) {
+        submit_current_cmd_and_wait();
+        context_->flush();
+      } else {
+        submit_current_cmd();
+      }
       staging_nbytes_in_cmd_ = 0;
       context_->set_cmd();
       submitted = true;
@@ -797,6 +800,7 @@ void ComputeGraph::prepack() {
     i++;
   }
   submit_current_cmd_and_wait(/*final_use=*/true);
+  context_->flush();
   staging_nbytes_in_cmd_ = 0;
 }
 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 23cac658d50..1961f5046e2 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -847,8 +847,7 @@ class ComputeGraph final {
 
   /*
    * Submits the current command buffer in the Context to the GPU for execution,
-   * and wait for it to complete before returning. This function will also flush
-   * the Context after execution.
+   * and wait for it to complete before returning.
    */
  void submit_current_cmd_and_wait(const bool final_use = false);
 
diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp
index 038a66159fb..e08491c656b 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.cpp
+++ b/backends/vulkan/runtime/vk_api/Adapter.cpp
@@ -307,17 +307,22 @@ void Adapter::return_queue(Adapter::Queue& compute_queue) {
 void Adapter::submit_cmd(
     const Adapter::Queue& device_queue,
     VkCommandBuffer cmd,
-    VkFence fence) {
+    VkFence fence,
+    VkSemaphore wait_semaphore,
+    VkSemaphore signal_semaphore) {
+  const VkPipelineStageFlags flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+  const bool set_wait_semaphore = wait_semaphore != VK_NULL_HANDLE;
+  const bool set_signal_semaphore = signal_semaphore != VK_NULL_HANDLE;
   const VkSubmitInfo submit_info{
       VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
       nullptr, // pNext
-      0u, // waitSemaphoreCount
-      nullptr, // pWaitSemaphores
-      nullptr, // pWaitDstStageMask
+      set_wait_semaphore ? 1u : 0u, // waitSemaphoreCount
+      set_wait_semaphore ? &wait_semaphore : nullptr, // pWaitSemaphores
+      &flags, // pWaitDstStageMask
       1u, // commandBufferCount
       &cmd, // pCommandBuffers
-      0u, // signalSemaphoreCount
-      nullptr, // pSignalSemaphores
+      set_signal_semaphore ? 1u : 0u, // signalSemaphoreCount
+      set_signal_semaphore ? &signal_semaphore : nullptr, // pSignalSemaphores
   };
 
   std::lock_guard<std::mutex> queue_lock(
diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h
index d242e2d3ac1..aa4c659c6d8 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.h
+++ b/backends/vulkan/runtime/vk_api/Adapter.h
@@ -242,8 +242,12 @@ class Adapter final {
 
   // Command Buffer Submission
 
-  void
-  submit_cmd(const Queue&, VkCommandBuffer, VkFence fence = VK_NULL_HANDLE);
+  void submit_cmd(
+      const Queue&,
+      VkCommandBuffer,
+      VkFence fence = VK_NULL_HANDLE,
+      VkSemaphore wait_semaphore = VK_NULL_HANDLE,
+      VkSemaphore signal_semaphore = VK_NULL_HANDLE);
 
   std::string stringize() const;
   friend std::ostream& operator<<(std::ostream&, const Adapter&);
diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp
index 3a5041f9500..4e0a915fe98 100644
--- a/backends/vulkan/runtime/vk_api/Command.cpp
+++ b/backends/vulkan/runtime/vk_api/Command.cpp
@@ -20,28 +20,34 @@ namespace vkapi {
 
 CommandBuffer::CommandBuffer(
     VkCommandBuffer handle,
+    VkSemaphore semaphore,
     const VkCommandBufferUsageFlags flags)
     : handle_(handle),
+      signal_semaphore_(semaphore),
       flags_(flags),
       state_(CommandBuffer::State::NEW),
       bound_{} {}
 
 CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept
     : handle_(other.handle_),
+      signal_semaphore_(other.signal_semaphore_),
       flags_(other.flags_),
-      state_(CommandBuffer::State::INVALID),
+      state_(other.state_),
       bound_(other.bound_) {
   other.handle_ = VK_NULL_HANDLE;
+  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
 }
 
 CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept {
   handle_ = other.handle_;
+  signal_semaphore_ = other.signal_semaphore_;
   flags_ = other.flags_;
   state_ = other.state_;
   bound_ = other.bound_;
 
   other.handle_ = VK_NULL_HANDLE;
+  other.signal_semaphore_ = VK_NULL_HANDLE;
   other.bound_.reset();
   other.state_ = CommandBuffer::State::INVALID;
 
@@ -304,6 +310,12 @@ CommandPool::~CommandPool() {
   if (pool_ == VK_NULL_HANDLE) {
     return;
   }
+  for (auto& semaphore : semaphores_) {
+    if (semaphore != VK_NULL_HANDLE) {
+      vkDestroySemaphore(device_, semaphore, nullptr);
+    }
+  }
+
   vkDestroyCommandPool(device_, pool_, nullptr);
 }
 
@@ -314,6 +326,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   allocate_new_batch(config_.cmd_pool_batch_size);
 
   VkCommandBuffer handle = buffers_[in_use_];
+  VkSemaphore semaphore = semaphores_[in_use_];
 
   VkCommandBufferUsageFlags cmd_flags = 0u;
   if (!reusable) {
@@ -321,7 +334,7 @@ CommandBuffer CommandPool::get_new_cmd(bool reusable) {
   }
 
   in_use_++;
-  return CommandBuffer(handle, cmd_flags);
+  return CommandBuffer(handle, semaphore, cmd_flags);
 }
 
 void CommandPool::flush() {
@@ -337,6 +350,7 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
   }
 
   buffers_.resize(buffers_.size() + count);
+  semaphores_.resize(buffers_.size() + count);
 
   const VkCommandBufferAllocateInfo allocate_info{
       VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType
@@ -348,6 +362,17 @@ void CommandPool::allocate_new_batch(const uint32_t count) {
 
   VK_CHECK(vkAllocateCommandBuffers(
       device_, &allocate_info, buffers_.data() + in_use_));
+
+  const VkSemaphoreCreateInfo semaphoreCreateInfo = {
+      VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0};
+
+  for (uint32_t i = 0; i < count; i++) {
+    VK_CHECK(vkCreateSemaphore(
+        device_,
+        &semaphoreCreateInfo,
+        nullptr,
+        semaphores_.data() + in_use_ + i));
+  }
 }
 
 } // namespace vkapi
diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h
index ff1e5934a5c..d6d3fe05a34 100644
--- a/backends/vulkan/runtime/vk_api/Command.h
+++ b/backends/vulkan/runtime/vk_api/Command.h
@@ -26,7 +26,10 @@ namespace vkapi {
 
 class CommandBuffer final {
  public:
-  explicit CommandBuffer(VkCommandBuffer, const VkCommandBufferUsageFlags);
+  explicit CommandBuffer(
+      VkCommandBuffer,
+      VkSemaphore,
+      const VkCommandBufferUsageFlags);
 
   CommandBuffer(const CommandBuffer&) = delete;
   CommandBuffer& operator=(const CommandBuffer&) = delete;
@@ -70,6 +73,8 @@ class CommandBuffer final {
 
  private:
   VkCommandBuffer handle_;
+  // Semaphore to signal when the command buffer has completed execution
+  VkSemaphore signal_semaphore_;
   VkCommandBufferUsageFlags flags_;
   State state_;
   Bound bound_;
@@ -81,6 +86,7 @@ class CommandBuffer final {
 
   inline void invalidate() {
     handle_ = VK_NULL_HANDLE;
+    signal_semaphore_ = VK_NULL_HANDLE;
     bound_.reset();
   }
 
@@ -100,6 +106,10 @@ class CommandBuffer final {
 
   VkCommandBuffer get_submit_handle(const bool final_use = false);
 
+  VkSemaphore get_signal_semaphore() const {
+    return signal_semaphore_;
+  }
+
   inline operator bool() const {
     return handle_ != VK_NULL_HANDLE;
   }
@@ -130,6 +140,8 @@ class CommandPool final {
   // New Buffers
   std::mutex mutex_;
   std::vector<VkCommandBuffer> buffers_;
+  // Semaphores corresponding to the command buffers
+  std::vector<VkSemaphore> semaphores_;
   size_t in_use_;
 
  public:
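
Note (not part of the patch): the pattern introduced above is per-command-buffer signal semaphores chained across submissions, so that each submitted batch waits, at the compute-shader stage, on the semaphore signaled by the previously submitted batch. The sketch below illustrates that chaining in isolation with raw Vulkan calls under stated assumptions; it is not the ExecuTorch API. The names chain_submit and check are hypothetical helpers, and the VkDevice, VkQueue, and pre-recorded command buffers are assumed to be created elsewhere.

// Illustrative sketch only: chain a sequence of pre-recorded command buffers
// so each submission waits on the previous one's signal semaphore.
#include <vulkan/vulkan.h>

#include <cstdlib>
#include <vector>

inline void check(VkResult result) {
  // Minimal error handling for the sketch; a real implementation would
  // propagate or log the failing VkResult.
  if (result != VK_SUCCESS) {
    std::abort();
  }
}

void chain_submit(
    VkDevice device,
    VkQueue queue,
    const std::vector<VkCommandBuffer>& cmds) {
  // One signal semaphore per command buffer, mirroring the CommandPool change.
  std::vector<VkSemaphore> semaphores(cmds.size(), VK_NULL_HANDLE);
  const VkSemaphoreCreateInfo semaphore_info{
      VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, nullptr, 0};
  for (VkSemaphore& semaphore : semaphores) {
    check(vkCreateSemaphore(device, &semaphore_info, nullptr, &semaphore));
  }

  // prev plays the role of prev_semaphore_ in Context: each submission waits
  // on the semaphore signaled by the one before it.
  VkSemaphore prev = VK_NULL_HANDLE;
  const VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

  for (size_t i = 0; i < cmds.size(); ++i) {
    const bool has_wait = prev != VK_NULL_HANDLE;
    const VkSubmitInfo submit_info{
        VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType
        nullptr, // pNext
        has_wait ? 1u : 0u, // waitSemaphoreCount
        has_wait ? &prev : nullptr, // pWaitSemaphores
        &wait_stage, // pWaitDstStageMask
        1u, // commandBufferCount
        &cmds[i], // pCommandBuffers
        1u, // signalSemaphoreCount
        &semaphores[i], // pSignalSemaphores
    };
    check(vkQueueSubmit(queue, 1u, &submit_info, VK_NULL_HANDLE));
    // The next submission will wait on this submission's signal semaphore.
    prev = semaphores[i];
  }

  // Drain the queue before destroying the semaphores.
  check(vkQueueWaitIdle(queue));
  for (VkSemaphore semaphore : semaphores) {
    vkDestroySemaphore(device, semaphore, nullptr);
  }
}

Because each submission only needs to order itself after the one immediately before it, remembering the single previously signaled semaphore is sufficient; no dependency graph is required. Waiting at VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT fits a compute-only workload such as this backend's; other workloads would typically choose a different wait stage.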