[Store|TransferEngine]: use condition-variable based completion instead of busy-polling (#1053)

wwq2333 · wangwenqi · web-flow · commit 496cecdd7cc3 · 2025-11-17T17:36:08.000+08:00
* (Event-driven completion) Provide an option to use condition-variable based completion instead of busy-polling

Summary
- Add USE_EVENT_DRIVEN_COMPLETION compile-time option in common.cmake (default OFF).
- Store: TransferEngineOperationState waits on BatchDesc::completion_cv with timeout; falls back to original polling when the flag is OFF.
- TransferEngine: add per-batch finished_task_count, has_failure, is_finished and completion_cv on BatchDesc; notify on last task completion; unify __atomic_* usage.
- No behavior change when USE_EVENT_DRIVEN_COMPLETION is OFF (default).



---------

Co-authored-by: wangwenqi <wenqi.wangwwq@shopee.com>
diff --git a/mooncake-common/common.cmake b/mooncake-common/common.cmake
@@ -74,6 +74,7 @@ option(WITH_RUST_EXAMPLE "build the Rust interface and sample code for the trans
 option(WITH_METRICS "enable metrics and metrics reporting thread" ON)
 option(USE_3FS "option for using 3FS storage backend" OFF)
 option(WITH_NVIDIA_PEERMEM "disable to support RDMA without nvidia-peermem. If WITH_NVIDIA_PEERMEM=OFF then USE_CUDA=ON is required." ON)
+option(USE_EVENT_DRIVEN_COMPLETION "option for using event-driven completion (store & transfer engine)" OFF)
 
 option(USE_LRU_MASTER "option for using LRU in master service" OFF)
 set(LRU_MAX_CAPACITY 1000)
@@ -83,6 +84,12 @@ if (USE_LRU_MASTER)
   add_compile_definitions(LRU_MAX_CAPACITY)
 endif()
 
+if (NOT USE_EVENT_DRIVEN_COMPLETION)
+  message(STATUS "Event-driven completion is disabled")
+else()
+  add_compile_definitions(USE_EVENT_DRIVEN_COMPLETION)
+  message(STATUS "Event-driven completion is enabled")
+endif()
 
 if (USE_NVMEOF)
   set(USE_CUDA ON)
diff --git a/mooncake-store/src/transfer_task.cpp b/mooncake-store/src/transfer_task.cpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <cstdlib>
 #include "transfer_engine.h"
+#include "transport/transport.h"
 
 namespace mooncake {
 
@@ -286,19 +287,64 @@ void TransferEngineOperationState::set_result_internal(ErrorCode error_code) {
     VLOG(1) << "Setting transfer result for batch " << batch_id_ << " to "
             << static_cast<int>(error_code);
     result_.emplace(error_code);
-
-    cv_.notify_all();
 }
 
 void TransferEngineOperationState::wait_for_completion() {
     if (is_completed()) {
         return;
     }
 
-    VLOG(1) << "Starting transfer engine polling for batch " << batch_id_;
     constexpr int64_t timeout_seconds = 60;
-    constexpr int64_t kOneSecondInNano = 1000 * 1000 * 1000;
 
+#ifdef USE_EVENT_DRIVEN_COMPLETION
+    VLOG(1) << "Waiting for transfer engine completion for batch " << batch_id_;
+
+    // Wait directly on BatchDesc's condition variable.
+    auto& batch_desc = Transport::toBatchDesc(batch_id_);
+    bool completed;
+    bool failed = false;
+
+    // Fast path: if already finished, avoid taking the mutex and waiting.
+    // Use acquire here to pair with the writer's release-store, because this
+    // path may skip taking the mutex. It ensures all prior updates are visible.
+    completed = batch_desc.is_finished.load(std::memory_order_acquire);
+    if (!completed) {
+        // Use the same mutex as the notifier when updating the predicate to
+        // avoid missed notifications. The predicate is re-checked under the
+        // lock. Under the mutex, relaxed is sufficient; the mutex acquire
+        // orders prior writes.
+        std::unique_lock<std::mutex> lock(batch_desc.completion_mutex);
+        completed = batch_desc.completion_cv.wait_for(
+            lock, std::chrono::seconds(timeout_seconds), [&batch_desc] {
+                return batch_desc.is_finished.load(std::memory_order_relaxed);
+            });
+    }  // Explicitly release completion_mutex before acquiring mutex_
+
+    // Once completion is observed, read failure flag.
+    if (completed) {
+        failed = batch_desc.has_failure.load(std::memory_order_relaxed);
+    }
+
+    ErrorCode error_code =
+        completed ? (failed ? ErrorCode::TRANSFER_FAIL : ErrorCode::OK)
+                  : ErrorCode::TRANSFER_FAIL;
+
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        set_result_internal(error_code);
+    }
+
+    if (completed) {
+        VLOG(1) << "Transfer engine operation completed for batch " << batch_id_
+                << " with result: " << static_cast<int>(error_code);
+    } else {
+        LOG(ERROR) << "Failed to complete transfers after " << timeout_seconds
+                   << " seconds for batch " << batch_id_;
+    }
+#else
+    VLOG(1) << "Starting transfer engine polling for batch " << batch_id_;
+
+    constexpr int64_t kOneSecondInNano = 1000 * 1000 * 1000;
     const int64_t start_ts = getCurrentTimeInNano();
 
     while (true) {
@@ -322,6 +368,7 @@ void TransferEngineOperationState::wait_for_completion() {
         VLOG(1) << "Transfer engine operation still pending for batch "
                 << batch_id_;
     }
+#endif
 }
 
 // ============================================================================
diff --git a/mooncake-transfer-engine/include/transport/transport.h b/mooncake-transfer-engine/include/transport/transport.h
@@ -26,6 +26,10 @@
 #include <memory>
 #include <queue>
 #include <string>
+#include <atomic>
+#include <functional>
+#include <mutex>
+#include <condition_variable>
 
 #include "common/base/status.h"
 #include "transfer_metadata.h"
@@ -76,8 +80,24 @@ class Transport {
         size_t transferred_bytes;
     };
 
+    struct BatchDesc;
     struct TransferTask;
 
+    // BatchID -> BatchDesc conversion.
+    //
+    // A BatchID is an opaque 64-bit unsigned handle whose value is the
+    // address of a BatchDesc. This helper casts the handle straight back
+    // to a reference; no map or lookup is involved, which keeps it cheap
+    // on hot paths.
+    //
+    // Nothing is validated here: the caller must guarantee that the
+    // BatchDesc behind the handle is still alive for as long as the
+    // returned reference is used.
+    static inline BatchDesc &toBatchDesc(BatchID id) {
+        auto *desc = reinterpret_cast<BatchDesc *>(id);
+        return *desc;
+    }
+
     // Slice must be allocated on heap, as it will delete self on markSuccess
     // or markFailed.
     struct Slice {
@@ -128,16 +148,76 @@ class Transport {
        public:
         void markSuccess() {
             status = Slice::SUCCESS;
-            __sync_fetch_and_add(&task->transferred_bytes, length);
-            __sync_fetch_and_add(&task->success_slice_count, 1);
+            __atomic_fetch_add(&task->transferred_bytes, length,
+                               __ATOMIC_SEQ_CST);
+            __atomic_fetch_add(&task->success_slice_count, 1, __ATOMIC_SEQ_CST);
+            // SEQ_CST keeps the full barrier the replaced __sync_* calls had.
+            check_batch_completion(false);
         }
 
         void markFailed() {
             status = Slice::FAILED;
-            __sync_fetch_and_add(&task->failed_slice_count, 1);
+            __atomic_fetch_add(&task->failed_slice_count, 1, __ATOMIC_SEQ_CST);
+            // SEQ_CST keeps the full barrier the replaced __sync_* call had.
+            check_batch_completion(true);
         }
 
         volatile int64_t ts;
+
+       private:
+        // Records one finished slice and, when it is the last slice of the
+        // last task of the batch, publishes batch completion and wakes the
+        // waiter. `is_failed` marks the whole batch as failed. Compiles to a
+        // no-op unless USE_EVENT_DRIVEN_COMPLETION is defined.
+        inline void check_batch_completion(bool is_failed) {
+#ifdef USE_EVENT_DRIVEN_COMPLETION
+            auto &batch_desc = toBatchDesc(task->batch_id);
+            if (is_failed) {
+                batch_desc.has_failure.store(true, std::memory_order_relaxed);
+            }
+
+            // Count this slice with one atomic RMW so exactly one thread
+            // observes the final value. acq_rel (not relaxed): the release
+            // half publishes this thread's earlier writes (has_failure, slice
+            // counters) and the acquire half lets the finishing thread inherit
+            // the writes of every slice counted before it.
+            uint64_t prev_completed = __atomic_fetch_add(
+                &task->completed_slice_count, 1, __ATOMIC_ACQ_REL);
+
+            // Only the thread completing the final slice will see prev+1 ==
+            // slice_count.
+            if (prev_completed + 1 == task->slice_count) {
+                __atomic_store_n(&task->is_finished, true, __ATOMIC_RELAXED);
+
+                // Same reasoning at batch level: a relaxed increment would not
+                // synchronize with the thread that finishes the batch, so
+                // writes made by threads finishing other tasks (notably
+                // has_failure) could be missed by the waiter.
+                auto prev = batch_desc.finished_task_count.fetch_add(
+                    1, std::memory_order_acq_rel);
+
+                // Last task in the batch: wake up waiting thread directly
+                if (prev + 1 == batch_desc.batch_size) {
+                    // Set the predicate under the waiter's mutex to avoid lost
+                    // notifications. The store is a release because the waiter
+                    // has a lock-free fast path that reads is_finished with
+                    // acquire.
+                    {
+                        std::lock_guard<std::mutex> lock(
+                            batch_desc.completion_mutex);
+                        batch_desc.is_finished.store(true,
+                                                     std::memory_order_release);
+                    }
+                    // Notify after releasing the lock to avoid waking threads
+                    // only to block again on the mutex.
+                    // NOTE(review): a waiter on the fast path may return before
+                    // this notify runs; batch_desc must outlive it — confirm
+                    // the owner does not free the batch right after waiting.
+                    batch_desc.completion_cv.notify_all();
+                }
+            }
+#endif
+        }
     };
 
     struct ThreadLocalSliceCache {
@@ -198,6 +278,10 @@ class Transport {
         uint64_t total_bytes = 0;
         BatchID batch_id = 0;
 
+#ifdef USE_EVENT_DRIVEN_COMPLETION
+        volatile uint64_t completed_slice_count = 0;
+#endif
+
         // record the origin request
 #ifdef USE_ASCEND_HETEROGENEOUS
         // need to modify the request's source address, changing it from an NPU
@@ -220,6 +304,18 @@ class Transport {
         std::vector<TransferTask> task_list;
         void *context;  // for transport implementers.
         int64_t start_timestamp;
+
+#ifdef USE_EVENT_DRIVEN_COMPLETION
+        // Event-driven completion: tracks batch progress and notifies waiters
+        std::atomic<uint64_t> finished_task_count{0};
+        std::atomic<bool> has_failure{false};
+        std::atomic<bool> is_finished{
+            false};  // Completion flag for wait predicate
+
+        // Synchronization primitives for direct notification
+        std::mutex completion_mutex;
+        std::condition_variable completion_cv;
+#endif
     };
 
    public: