From 367ace484bd341f623aa2d9cdcc33122164269da Mon Sep 17 00:00:00 2001
From: Minh Quan Ho
Date: Mon, 29 Apr 2024 11:19:14 +0200
Subject: [PATCH 1/2] OPAL: re-enable async progress thread

- The SW-based async progress thread was planned a long time ago in
  683efcb, but has never been enabled or implemented since.

- This commit enables spawning an async progress thread that executes the
  _opal_progress() routine when the feature is enabled at both compile time
  and runtime (--enable-progress-threads (default=enabled), plus the MCA
  variable opal_async_progress or mpi_async_progress).

- Fix minor typo in opal_progress.h doxygen comment

Signed-off-by: Minh Quan Ho
---
 config/opal_configure_options.m4           |  22 +++-
 ompi/instance/instance.c                   |   6 +-
 ompi/mpi/c/request_get_status.c.in         |  10 +-
 ompi/request/req_test.c                    |  26 ++---
 ompi/runtime/ompi_mpi_finalize.c           |   7 +-
 ompi/runtime/ompi_mpi_init.c               |   8 +-
 ompi/runtime/ompi_mpi_params.c             |   7 ++
 opal/mca/btl/smcuda/btl_smcuda.c           |   2 +-
 opal/mca/btl/smcuda/btl_smcuda_component.c |   2 +-
 opal/runtime/opal_params_core.c            |  12 ++
 opal/runtime/opal_params_core.h            |  10 ++
 opal/runtime/opal_progress.c               | 130 ++++++++++++++++++++-
 opal/runtime/opal_progress.h               |   5 +-
 13 files changed, 204 insertions(+), 43 deletions(-)

diff --git a/config/opal_configure_options.m4 b/config/opal_configure_options.m4
index 66a5b3e5a39..9ab702295a4 100644
--- a/config/opal_configure_options.m4
+++ b/config/opal_configure_options.m4
@@ -544,9 +544,21 @@ fi
 AC_DEFINE_UNQUOTED([OPAL_ENABLE_GETPWUID], [$opal_want_getpwuid],
                    [Disable getpwuid support (default: enabled)])
 
-dnl We no longer support the old OPAL_ENABLE_PROGRESS_THREADS. At
-dnl some point, this should die.
-AC_DEFINE([OPAL_ENABLE_PROGRESS_THREADS],
-          [0],
-          [Whether we want BTL progress threads enabled])
+#
+# Asynchronous progress threads
+#
+AC_MSG_CHECKING([if want asynchronous progress threads])
+AC_ARG_ENABLE([progress_threads],
+              [AS_HELP_STRING([--disable-progress-threads],
+                              [Disable asynchronous progress threads. Note that even when this support is enabled, for performance reasons, the progress thread is still not spawned by default. The user must set the MCA variable 'opal_async_progress' or 'mpi_async_progress' to have the progress thread spawned at runtime. (default: enabled)])])
+if test "$enable_progress_threads" = "no"; then
+    AC_MSG_RESULT([no])
+    opal_want_progress_threads=0
+else
+    AC_MSG_RESULT([yes])
+    opal_want_progress_threads=1
+fi
+AC_DEFINE_UNQUOTED([OPAL_ENABLE_PROGRESS_THREADS], [$opal_want_progress_threads],
+                   [Whether asynchronous progress threads are enabled (default: enabled)])
+
 ])dnl
diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c
index ff140e98891..05f00a0ba44 100644
--- a/ompi/instance/instance.c
+++ b/ompi/instance/instance.c
@@ -512,7 +512,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
        ddt_init, but before mca_coll_base_open, since some collective
        modules (e.g., the hierarchical coll component) may need ops in
        their query function.
*/ - if (OMPI_SUCCESS != (ret = ompi_op_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { + if (OMPI_SUCCESS != (ret = ompi_op_base_find_available (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) { return ompi_instance_print_error ("ompi_op_base_find_available() failed", ret); } @@ -532,7 +532,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("mca_smsc_base_select() failed", ret); } - if (OMPI_SUCCESS != (ret = mca_pml_base_select (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { + if (OMPI_SUCCESS != (ret = mca_pml_base_select (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) { return ompi_instance_print_error ("mca_pml_base_select() failed", ret); } @@ -617,7 +617,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("mca_pml_base_bsend_init() failed", ret); } - if (OMPI_SUCCESS != (ret = mca_coll_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { + if (OMPI_SUCCESS != (ret = mca_coll_base_find_available (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) { return ompi_instance_print_error ("mca_coll_base_find_available() failed", ret); } diff --git a/ompi/mpi/c/request_get_status.c.in b/ompi/mpi/c/request_get_status.c.in index 86ba237866b..fdc18339e15 100644 --- a/ompi/mpi/c/request_get_status.c.in +++ b/ompi/mpi/c/request_get_status.c.in @@ -38,9 +38,7 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag, STATUS_OUT status) { -#if OPAL_ENABLE_PROGRESS_THREADS == 0 int do_it_once = 0; -#endif MEMCHECKER( memchecker_request(&request); @@ -56,9 +54,7 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag, } } -#if OPAL_ENABLE_PROGRESS_THREADS == 0 recheck_request_status: -#endif opal_atomic_mb(); if( (request == MPI_REQUEST_NULL) || (request->req_state == OMPI_REQUEST_INACTIVE) ) { *flag = true; @@ -80,8 +76,8 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag, } return MPI_SUCCESS; } -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - if( 0 == do_it_once ) { + + if( 0 == do_it_once && !opal_async_progress_thread_spawned ) { /* If we run the opal_progress then check the status of the request before leaving. We will call the opal_progress only once per call. */ @@ -89,7 +85,7 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag, do_it_once++; goto recheck_request_status; } -#endif + *flag = false; return MPI_SUCCESS; } diff --git a/ompi/request/req_test.c b/ompi/request/req_test.c index b28ade8a67a..20ecc75f0cf 100644 --- a/ompi/request/req_test.c +++ b/ompi/request/req_test.c @@ -32,11 +32,9 @@ int ompi_request_default_test(ompi_request_t ** rptr, { ompi_request_t *request = *rptr; -#if OPAL_ENABLE_PROGRESS_THREADS == 0 int do_it_once = 0; recheck_request_status: -#endif if( request->req_state == OMPI_REQUEST_INACTIVE ) { *completed = true; if (MPI_STATUS_IGNORE != status) { @@ -81,8 +79,8 @@ int ompi_request_default_test(ompi_request_t ** rptr, return MPI_ERR_PROC_FAILED_PENDING; } #endif -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - if( 0 == do_it_once ) { + + if( 0 == do_it_once && !opal_async_progress_thread_spawned ) { /** * If we run the opal_progress then check the status of the request before * leaving. We will call the opal_progress only once per call. 
@@ -92,7 +90,7 @@ int ompi_request_default_test(ompi_request_t ** rptr, goto recheck_request_status; } } -#endif + *completed = false; return OMPI_SUCCESS; } @@ -163,9 +161,9 @@ int ompi_request_default_test_any( *index = MPI_UNDEFINED; if(num_requests_null_inactive != count) { *completed = false; -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - opal_progress(); -#endif + if (!opal_async_progress_thread_spawned) { + opal_progress(); + } } else { *completed = true; if (MPI_STATUS_IGNORE != status) { @@ -208,8 +206,8 @@ int ompi_request_default_test_all( return MPI_ERR_PROC_FAILED_PENDING; } #endif /* OPAL_ENABLE_FT_MPI */ -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - if (0 == do_it_once) { + + if (0 == do_it_once && !opal_async_progress_thread_spawned) { ++do_it_once; if (0 != opal_progress()) { /* continue walking the list, retest the current request */ @@ -217,7 +215,7 @@ int ompi_request_default_test_all( continue; } } -#endif /* OPAL_ENABLE_PROGRESS_THREADS */ + /* short-circuit */ break; } @@ -353,9 +351,9 @@ int ompi_request_default_test_some( *outcount = num_requests_done; if (num_requests_done == 0) { -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - opal_progress(); -#endif + if (!opal_async_progress_thread_spawned) { + opal_progress(); + } return OMPI_SUCCESS; } diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c index ad8a328dc55..8e4d7e66c52 100644 --- a/ompi/runtime/ompi_mpi_finalize.c +++ b/ompi/runtime/ompi_mpi_finalize.c @@ -193,9 +193,10 @@ int ompi_mpi_finalize(void) opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT); -#if OPAL_ENABLE_PROGRESS_THREADS == 0 - opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK); -#endif + /* shutdown async progress thread before tearing down further services */ + if (opal_async_progress_thread_spawned) { + opal_progress_shutdown_async_progress_thread(); + } /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across *all* connected processes. This only means that all processes diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 787e1e10249..10b2b780812 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -559,16 +559,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided, time if so, then start the clock again */ OMPI_TIMING_NEXT("barrier"); -#if OPAL_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take forever between procs in the dynamic code. This will increase CPU utilization for the remainder of MPI_INIT when we are blocking on RTE-level events, but may greatly reduce non-TCP latency. */ - int old_event_flags = opal_progress_set_event_flag(0); - opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); -#endif + if (!opal_async_progress_thread_spawned) { + int old_event_flags = opal_progress_set_event_flag(0); + opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK); + } /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. 
Do before the diff --git a/ompi/runtime/ompi_mpi_params.c b/ompi/runtime/ompi_mpi_params.c index c747d55ee7d..b30fb6bed27 100644 --- a/ompi/runtime/ompi_mpi_params.c +++ b/ompi/runtime/ompi_mpi_params.c @@ -351,6 +351,13 @@ int ompi_mpi_register_params(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_async_mpi_finalize); +#if OPAL_ENABLE_PROGRESS_THREADS == 1 + value = mca_base_var_find ("opal", "opal", NULL, "async_progress"); + if (0 <= value) { + (void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "async_progress", 0); + } +#endif + value = mca_base_var_find ("opal", "opal", NULL, "abort_delay"); if (0 <= value) { (void) mca_base_var_register_synonym(value, "ompi", "mpi", NULL, "abort_delay", diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index c4389d422f2..ad2537a0c12 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -487,7 +487,7 @@ static struct mca_btl_base_endpoint_t *create_sm_endpoint(int local_proc, struct OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t); #if OPAL_ENABLE_PROGRESS_THREADS == 1 sprintf(path, "%s" OPAL_PATH_SEP "sm_fifo.%lu", opal_process_info.job_session_dir, - (unsigned long) proc->proc_name); + (unsigned long) proc->proc_name.vpid); ep->fifo_fd = open(path, O_WRONLY); if (ep->fifo_fd < 0) { opal_output(0, "mca_btl_smcuda_add_procs: open(%s) failed with errno=%d\n", path, errno); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 72b75d67311..51981a0a08a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -860,7 +860,7 @@ mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool #if OPAL_ENABLE_PROGRESS_THREADS == 1 /* create a named pipe to receive events */ sprintf(mca_btl_smcuda_component.sm_fifo_path, "%s" OPAL_PATH_SEP "sm_fifo.%lu", - opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME->vpid); + opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME.vpid); if (mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) { opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n", errno); return NULL; diff --git a/opal/runtime/opal_params_core.c b/opal/runtime/opal_params_core.c index 803c06fb9f4..18dd942471c 100644 --- a/opal/runtime/opal_params_core.c +++ b/opal/runtime/opal_params_core.c @@ -90,6 +90,10 @@ int opal_abort_delay = 0; int opal_max_thread_in_progress = 1; +#if OPAL_ENABLE_PROGRESS_THREADS == 1 +bool opal_async_progress = false; +#endif + static bool opal_register_util_done = false; static char *opal_var_dump_color_string = NULL; @@ -416,6 +420,14 @@ int opal_register_util_params(void) MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_READONLY, &opal_max_thread_in_progress); +#if OPAL_ENABLE_PROGRESS_THREADS == 1 + /* Spawn a dedicated software progress-thread to execute opal_progress() */ + (void) mca_base_var_register("opal", "opal", NULL, "async_progress", + "Spawn a dedicated software progress-thread. 
Default: false", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, &opal_async_progress); +#endif + /* Use sync_memops functionality with accelerator codes or deploy alternative path using IPC events to ensure consistency */ opal_accelerator_use_sync_memops = true; diff --git a/opal/runtime/opal_params_core.h b/opal/runtime/opal_params_core.h index b90d3fc2961..54f3d9298fb 100644 --- a/opal/runtime/opal_params_core.h +++ b/opal/runtime/opal_params_core.h @@ -60,6 +60,16 @@ OPAL_DECLSPEC extern bool opal_built_with_cuda_support; OPAL_DECLSPEC extern bool opal_built_with_rocm_support; OPAL_DECLSPEC extern bool opal_built_with_ze_support; +# if OPAL_ENABLE_PROGRESS_THREADS == 1 +OPAL_DECLSPEC extern bool opal_async_progress_thread_spawned; +# else +/* When disabled at configure, using 'static const' will allow compilers to + * not only evaluate the boolean at compile time, but also to remove its + * storage within any object file which is including this header. + * See https://godbolt.org/z/hxc8EjPa4 */ +OPAL_DECLSPEC static const bool opal_async_progress_thread_spawned = false; +# endif + /** * * Whether we want to enable CUDA GPU buffer send and receive support. * */ diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c index 7bca660b9d6..c622609a4ef 100644 --- a/opal/runtime/opal_progress.c +++ b/opal/runtime/opal_progress.c @@ -50,6 +50,36 @@ bool opal_progress_debug = false; static int opal_progress_event_flag = OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK; int opal_progress_spin_count = 10000; +#if OPAL_ENABLE_PROGRESS_THREADS == 1 +/* MCA OPAL parameter */ +extern bool opal_async_progress; + +/* Track whether the async progress thread was successfully spawned. + * This boolean is set at init, and read at finalize to join the + * progress thread */ +bool opal_async_progress_thread_spawned = false; + +/* async progress thread info & args */ +typedef struct thread_args_s { + /* number of events reported. + * This is updated by the async progress thread, to be read and resetted + * by the application threads */ + opal_atomic_int64_t nb_events_reported; + + /* should continue running ? 
+    volatile bool running;
+} thread_args_t;
+
+/* async progress thread routine */
+static void *opal_progress_async_thread_engine(opal_object_t *obj);
+
+static opal_thread_t opal_progress_async_thread;
+static thread_args_t thread_arg = {
+    .nb_events_reported = 0,
+    .running = false
+};
+#endif
+
 /*
  * Local variables
  */
@@ -119,9 +149,29 @@ static void opal_progress_finalize(void)
     opal_atomic_unlock(&progress_lock);
 }
 
+void opal_progress_shutdown_async_progress_thread(void)
+{
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    if (opal_async_progress_thread_spawned) {
+        /* shut down the async thread */
+        thread_arg.running = false;
+        int err = opal_thread_join(&opal_progress_async_thread, NULL);
+        if (OPAL_SUCCESS == err) {
+            opal_set_using_threads(false);
+            opal_async_progress_thread_spawned = false;
+        } else {
+            OPAL_OUTPUT(
+                (debug_output, "progress: Failed to join async progress thread: err=%d", err));
+        }
+    }
+#endif
+}
+
 /* init the progress engine - called from orte_init */
 int opal_progress_init(void)
 {
+    int err = OPAL_SUCCESS;
+
     /* reentrant issues */
     opal_atomic_lock_init(&progress_lock, OPAL_ATOMIC_LOCK_UNLOCKED);
 
@@ -155,6 +205,32 @@ int opal_progress_init(void)
         callbacks_lp[i] = fake_cb;
     }
 
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    if (opal_async_progress && !opal_async_progress_thread_spawned) {
+        /* prepare the thread */
+        thread_arg.nb_events_reported = 0;
+        thread_arg.running = true;
+        OBJ_CONSTRUCT(&opal_progress_async_thread, opal_thread_t);
+        opal_progress_async_thread.t_run = opal_progress_async_thread_engine;
+        opal_progress_async_thread.t_arg = &thread_arg;
+
+        /* optimistic setting for asynchronism here, but we know that these might
+           still be changed by other *init() routines :( */
+        opal_progress_set_yield_when_idle(true);
+        opal_progress_set_event_flag(opal_progress_event_flag | OPAL_EVLOOP_NONBLOCK);
+
+        err = opal_thread_start(&opal_progress_async_thread);
+        if (OPAL_SUCCESS == err) {
+            opal_set_using_threads(true);
+            opal_async_progress_thread_spawned = true;
+        } else {
+            thread_arg.running = false;
+            OPAL_OUTPUT(
+                (debug_output, "progress: Failed to start async progress thread: err=%d", err));
+        }
+    }
+#endif
+
     OPAL_OUTPUT(
         (debug_output, "progress: initialized event flag to: %x", opal_progress_event_flag));
     OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
@@ -165,7 +241,7 @@ int opal_progress_init(void)
 
     opal_finalize_register_cleanup(opal_progress_finalize);
 
-    return OPAL_SUCCESS;
+    return err;
 }
 
 static int opal_progress_events(void)
@@ -213,7 +289,7 @@ static int opal_progress_events(void)
  * care, as the cost of that happening is far outweighed by the cost
 * of the if checks (they were resulting in bad pipe stalling behavior)
 */
-int opal_progress(void)
+static int _opal_progress(void)
 {
     static uint32_t num_calls = 0;
     size_t i;
@@ -224,13 +300,17 @@
         events += (callbacks[i])();
     }
 
-    /* Run low priority callbacks and events once every 8 calls to opal_progress().
+    /* Run low priority callbacks and events once every N calls to opal_progress().
      * Even though "num_calls" can be modified by multiple threads, we do not use
      * atomic operations here, for performance reasons. In case of a race, the
      * number of calls may be inaccurate, but since it will eventually be incremented,
      * it's not a problem.
+     * If opal_async_progress_thread_spawned == false, then N = 8;
+     * otherwise let's pick N = 256 for the moment (George's recommendation) and
+     * adapt it later if it takes too many resources.
     */
-    if (((num_calls++) & 0x7) == 0) {
+    const uint32_t mod = opal_async_progress_thread_spawned ? 0xFF : 0x7;
+    if (((num_calls++) & mod) == 0) {
         for (i = 0; i < callbacks_lp_len; ++i) {
             events += (callbacks_lp[i])();
         }
@@ -254,6 +334,48 @@ int opal_progress(void)
     return events;
 }
 
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+static void *opal_progress_async_thread_engine(opal_object_t *obj)
+{
+    opal_thread_t *current_thread = (opal_thread_t *) obj;
+    thread_args_t *p_thread_arg = (thread_args_t *) current_thread->t_arg;
+
+    while (p_thread_arg->running) {
+        const int64_t new_events = _opal_progress();
+        if (new_events > 0) {
+            opal_atomic_add_fetch_64(&p_thread_arg->nb_events_reported, new_events);
+        }
+    }
+
+    return OPAL_THREAD_CANCELLED;
+}
+#endif
+
+int opal_progress(void)
+{
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    if (opal_async_progress_thread_spawned) {
+        /* the async progress thread running alongside may have processed new
+         * events; atomically read and reset nb_events_reported to zero.
+         */
+        const int64_t new_events = opal_atomic_swap_64(&thread_arg.nb_events_reported, 0);
+
+        /* if there are no new events, the application thread may yield here */
+        if (opal_progress_yield_when_idle && new_events <= 0) {
+            opal_thread_yield();
+        }
+        return new_events;
+    } else {
+#endif
+
+        /* no async progress thread: call the normal progress routine as before */
+        return _opal_progress();
+
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    }
+#endif
+}
+
 int opal_progress_set_event_flag(int flag)
 {
     int tmp = opal_progress_event_flag;
diff --git a/opal/runtime/opal_progress.h b/opal/runtime/opal_progress.h
index 86a0f8add50..d6a782330d8 100644
--- a/opal/runtime/opal_progress.h
+++ b/opal/runtime/opal_progress.h
@@ -142,7 +142,7 @@ OPAL_DECLSPEC void opal_progress_set_event_poll_rate(int microseconds);
  *
  * Prototype for the a progress function callback. Progress function
  * callbacks can be registered with opal_progress_register() and
- * deregistered with opal_progress_deregister(). It should be noted
+ * deregistered with opal_progress_unregister(). It should be noted
  * that either registering or deregistering a function callback is an
  * extraordinarily expensive operation and should not be used for
  * potentially short callback lifetimes.
@@ -195,6 +195,9 @@ static inline bool opal_progress_spin(volatile bool *complete)
     return false;
 }
 
+/* Shut down the async progress thread. Does nothing if disabled. */
+OPAL_DECLSPEC void opal_progress_shutdown_async_progress_thread(void);
+
 END_C_DECLS
 
 #endif

From 0c6839e48b5a6b908152f96514cd8f789c065190 Mon Sep 17 00:00:00 2001
From: Minh Quan Ho
Date: Thu, 19 Jun 2025 15:17:20 +0200
Subject: [PATCH 2/2] docs: add documentation on async progress thread

Signed-off-by: Minh Quan Ho
---
 .../configure-cli-options/misc.rst      | 13 ++++
 docs/launching-apps/index.rst           |  1 +
 docs/launching-apps/progress_thread.rst | 75 +++++++++++++++++++
 3 files changed, 89 insertions(+)
 create mode 100644 docs/launching-apps/progress_thread.rst

diff --git a/docs/installing-open-mpi/configure-cli-options/misc.rst b/docs/installing-open-mpi/configure-cli-options/misc.rst
index b6d263213a5..8b1c8c09184 100644
--- a/docs/installing-open-mpi/configure-cli-options/misc.rst
+++ b/docs/installing-open-mpi/configure-cli-options/misc.rst
@@ -34,6 +34,19 @@ above categories that can be used with ``configure``:
 
    .. danger:: The heterogeneous functionality is currently broken |mdash|
                do not use.
 
+* ``--enable-progress-threads``
+* ``--disable-progress-threads``:
+  Enable or disable (default = ``enabled``) support for a software-based
+  progress thread in each MPI process that executes the internal communication
+  progression engine. Note that even when the support is built, the progress
+  thread is not spawned by default at runtime. This behavior is controlled by
+  the associated runtime MCA variable ``opal_async_progress`` or
+  ``mpi_async_progress`` (default = ``false``).
+
+  .. warning:: Enabling the progress thread can degrade performance in some
+               configurations. Please read
+               :ref:`this section <async-progress-thread-label>` for
+               more details.
+
 .. _install-wrapper-flags-label:
 
 * ``--with-wrapper-cflags=CFLAGS``
diff --git a/docs/launching-apps/index.rst b/docs/launching-apps/index.rst
index 56b02f525da..fa86dd99cf2 100644
--- a/docs/launching-apps/index.rst
+++ b/docs/launching-apps/index.rst
@@ -39,6 +39,7 @@ same command).
    prerequisites
    pmix-and-prrte
    scheduling
+   progress_thread
    localhost
    ssh
diff --git a/docs/launching-apps/progress_thread.rst b/docs/launching-apps/progress_thread.rst
new file mode 100644
index 00000000000..92ad41f356c
--- /dev/null
+++ b/docs/launching-apps/progress_thread.rst
@@ -0,0 +1,75 @@
+.. _async-progress-thread-label:
+
+Asynchronous progress thread
+============================
+
+Open MPI provides experimental support for a software-based asynchronous
+progress thread. This progress thread runs the internal progression engine
+in the background to advance non-blocking communication and improve its
+overlap with computation.
+
+Enabling the progress thread at configure time
+----------------------------------------------
+
+The feature can be enabled or disabled at configure time by passing
+``--enable-progress-threads`` or ``--disable-progress-threads`` to
+``configure``. It is enabled by default.
+
+Enabling the progress thread at runtime
+---------------------------------------
+
+Even when Open MPI was configured and built with ``--enable-progress-threads``,
+the progress thread remains deactivated at runtime by default.
+
+The progress thread can be activated by setting one of the following
+boolean MCA variables on the launch command line:
+
+.. code-block:: sh
+
+    shell$ mpirun --mca opal_async_progress 1 ...
+    shell$ mpirun --mca mpi_async_progress 1 ...
+    shell$ OMPI_MCA_opal_async_progress=1 mpirun ...
+    shell$ OMPI_MCA_mpi_async_progress=1 mpirun ...
+
+Note that ``mpi_async_progress`` is a synonym of ``opal_async_progress``.
+
+.. warning:: Progress threads are a complicated issue. Activating them
+   at run time may improve the overlap of communication and computation in
+   your application (particularly applications with non-blocking
+   communication), which can improve overall performance. But there may also
+   be unintended consequences that degrade overall application performance.
+   Users are advised to experiment and see what works best for their
+   applications.
+
+Rationale
+---------
+
+A use case that may benefit from the software progress thread is *intra-node
+shared-memory non-blocking* communication on high core-count CPUs, where the
+application may not use all the available cores, or where some cores are
+reserved and dedicated to communication tasks. In such configurations, the
+latency of some non-blocking collective operations (e.g. ``MPI_Ireduce()``)
+can be improved because the arithmetic operations are performed in the
+background by the progress thread, instead of being deferred to the main
+thread during ``MPI_Wait()``.
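+
+The following minimal sketch illustrates the overlap pattern that the
+progress thread targets, inside an already-initialized MPI program
+(``do_local_computation()`` and the buffer size are hypothetical
+placeholders, for illustration only):
+
+.. code-block:: c
+
+    double sendbuf[1024], recvbuf[1024]; /* assume sendbuf is filled elsewhere */
+    MPI_Request req;
+
+    /* start a non-blocking reduction; with the progress thread active,
+     * its arithmetic can advance in the background */
+    MPI_Ireduce(sendbuf, recvbuf, 1024, MPI_DOUBLE, MPI_SUM, 0,
+                MPI_COMM_WORLD, &req);
+
+    do_local_computation(); /* overlapped with the ongoing reduction */
+
+    /* ideally most of the reduction has already completed, leaving
+     * little deferred work to be done inside this wait */
+    MPI_Wait(&req, MPI_STATUS_IGNORE);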
+
+Alternatively, on systems where *inter-node communication* is already
+offloaded to dedicated hardware, enabling the software-based progress
+thread could degrade performance, since the additional thread forces
+progression through the CPU and potentially away from more optimized
+hardware functionality.
+
+For these performance reasons, the progress thread is not activated
+(spawned) by default at runtime. It is up to developers to decide whether
+to enable the progress thread, depending on their application and system
+setup.
+
+Limitations
+-----------
+
+#. The current implementation does not yet support binding the progress
+   thread to a specific core (or set of cores).
+
+#. There are still some hard-coded constant parameters in the code that
+   require further tuning.
+
+#. Some multi-threading overhead has been observed, which may impact
+   performance on small buffers.
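+
+As the warning above suggests, the benefit is highly application- and
+system-dependent; a quick way to evaluate it is to run the same job with
+and without the progress thread and compare timings (``./my_app`` is a
+placeholder for your application):
+
+.. code-block:: sh
+
+    # baseline: support compiled in, but no progress thread spawned
+    shell$ mpirun -n 8 ./my_app
+
+    # identical run with the async progress thread spawned
+    shell$ mpirun -n 8 --mca opal_async_progress 1 ./my_app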