diff --git a/config/opal_configure_options.m4 b/config/opal_configure_options.m4
index 66a5b3e5a39..ddc9b88fae8 100644
--- a/config/opal_configure_options.m4
+++ b/config/opal_configure_options.m4
@@ -544,9 +544,21 @@ fi
 AC_DEFINE_UNQUOTED([OPAL_ENABLE_GETPWUID], [$opal_want_getpwuid],
                    [Disable getpwuid support (default: enabled)])
 
-dnl We no longer support the old OPAL_ENABLE_PROGRESS_THREADS.  At
-dnl some point, this should die.
-AC_DEFINE([OPAL_ENABLE_PROGRESS_THREADS],
-          [0],
-          [Whether we want BTL progress threads enabled])
+#
+# Disable progress threads
+#
+AC_MSG_CHECKING([if want asynchronous progress threads])
+AC_ARG_ENABLE([progress_threads],
+    [AS_HELP_STRING([--disable-progress-threads],
+                   [Disable asynchronous progress threads. Note that when enabled, for performance-related reasons, the progress thread is still not spawned by default. User must set env var OMPI_ASYNC_PROGRESS or OPAL_ASYNC_PROGRESS to 1 to really activate the progress thread at runtime. (default: enabled)])])
+if test "$enable_progress_threads" = "no"; then
+    AC_MSG_RESULT([no])
+    opal_want_progress_threads=0
+else
+    AC_MSG_RESULT([yes])
+    opal_want_progress_threads=1
+fi
+AC_DEFINE_UNQUOTED([OPAL_ENABLE_PROGRESS_THREADS], [$opal_want_progress_threads],
+    [Disable BTL asynchronous progress threads (default: enabled)])
+
 ])dnl
diff --git a/docs/installing-open-mpi/configure-cli-options/misc.rst b/docs/installing-open-mpi/configure-cli-options/misc.rst
index b6d263213a5..d1ce4e10926 100644
--- a/docs/installing-open-mpi/configure-cli-options/misc.rst
+++ b/docs/installing-open-mpi/configure-cli-options/misc.rst
@@ -34,6 +34,18 @@ above categories that can be used with ``configure``:
   .. danger:: The heterogeneous functionality is currently broken |mdash|
               do not use.
 
+* ``--enable-progress-threads``
+* ``--disable-progress-threads``:
+  Enable or disable (default = ``enabled``) software-based progress thread
+  for each MPI process to execute the internal communication progression
+  engine. When enabled at build time, the thread can be activated (default =
+  deactivated) at runtime by setting environment variable
+  ``OMPI_ASYNC_PROGRESS`` or ``OPAL_ASYNC_PROGRESS`` to a non-zero value.
+
+  .. warning:: Be aware of performance degradation. Please read
+               :ref:`this section <async-progress-thread-label>` for
+               more documentation.
+
 .. _install-wrapper-flags-label:
 
 * ``--with-wrapper-cflags=CFLAGS``
diff --git a/docs/launching-apps/index.rst b/docs/launching-apps/index.rst
index 56b02f525da..fa86dd99cf2 100644
--- a/docs/launching-apps/index.rst
+++ b/docs/launching-apps/index.rst
@@ -39,6 +39,7 @@ same command).
    prerequisites
    pmix-and-prrte
    scheduling
+   progress_thread
 
    localhost
    ssh
diff --git a/docs/launching-apps/progress_thread.rst b/docs/launching-apps/progress_thread.rst
new file mode 100644
index 00000000000..5bf099d996b
--- /dev/null
+++ b/docs/launching-apps/progress_thread.rst
@@ -0,0 +1,70 @@
+.. _async-progress-thread-label:
+
+Asynchronous progress thread
+============================
+
+OpenMPI provides an experimental support of software-based asynchronous
+progress thread. This progress thread is in charge of running internal
+progression engine in the background to advance non-blocking overlapping
+communication.
+
+Enabling progress thread at configuration time
+----------------------------------------------
+
+The feature is can be enabled or disabled at configuration by passing
+``--enable-progress-threads`` or ``--disable-progress-threads`` to
+``configure``. The default state is enabled.
+
+Enabling progress thread at runtime
+-----------------------------------
+
+When OpenMPI was built with ``--enable-progress-threads``, the progress
+thread is still deactivated at runtime by default.
+
+The progress thread can be activated by setting one of the following
+environment variables to any non-zero value in the launching command:
+
+.. code-block:: sh
+
+   shell$ OMPI_ASYNC_PROGRESS=1 mpirun ...
+   shell$ OPAL_ASYNC_PROGRESS=7 mpirun ...
+
+When both variables are set, ``OMPI_ASYNC_PROGRESS`` will be preferred to
+``OPAL_ASYNC_PROGRESS``.
+
+.. warning:: Activating progress thread at runtime may bring more harm
+             than improvement to performance. User should try it with caution.
+
+Rationale
+---------
+
+A possible beneficial usecase of software progress thread is *intra-node
+shared-mem non-blocking* communication, running on some high core-count CPUs,
+on which application may not use all the available cores, or the CPU has some
+reserved cores dedicated to communication tasks. In such configuration, the
+latency of some non-blocking collective operations (e.g. ``MPI_Ireduce()``)
+can be improved thanks to arithmetic operations being performed in the
+background by the progress thread, instead of the main thread which would only
+perform computation lastly at the moment of ``MPI_Wait()``, as currently
+implemented in OpenMPI.
+
+On another hand, on systems where *inter-node communication* was already
+offloaded to an appropriate BTL backend and associated hardware devices,
+enabling the software-based progress thread could be harmful to performance,
+since the additional thread will take up more CPU time for mostly no utility.
+
+For these performance reasons, the progress thread is not activated (spawned)
+by default at runtime. It is upon developers to decide to switch on the
+progress thread, depending on their application and system setup.
+
+Limitations
+-----------
+
+#. The current implementation does not support (yet) binding the progress
+   thread to a specific core (or set of cores).
+
+#. There are still some hard-coded constant parameters in the code that
+   would require further tuning.
+
+#. It was observed that some multi-threading overhead may impact performance
+   on small buffers.
diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c
index 0b83e442b0c..d15d582faa3 100644
--- a/ompi/instance/instance.c
+++ b/ompi/instance/instance.c
@@ -512,7 +512,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
        ddt_init, but before mca_coll_base_open, since some collective
        modules (e.g., the hierarchical coll component) may need ops in
        their query function. */
-    if (OMPI_SUCCESS != (ret = ompi_op_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
+    if (OMPI_SUCCESS != (ret = ompi_op_base_find_available (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) {
         return ompi_instance_print_error ("ompi_op_base_find_available() failed", ret);
     }
 
@@ -532,7 +532,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ompi_instance_print_error ("mca_smsc_base_select() failed", ret);
     }
 
-    if (OMPI_SUCCESS != (ret = mca_pml_base_select (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
+    if (OMPI_SUCCESS != (ret = mca_pml_base_select (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) {
         return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
     }
 
@@ -613,11 +613,11 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
         return ompi_instance_print_error ("mca_pml_base_bsend_init() failed", ret);
     }
 
-    if (OMPI_SUCCESS != (ret = mca_coll_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
+    if (OMPI_SUCCESS != (ret = mca_coll_base_find_available (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) {
         return ompi_instance_print_error ("mca_coll_base_find_available() failed", ret);
     }
 
-    if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
+    if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (opal_async_progress_thread_spawned, ompi_mpi_thread_multiple))) {
         return ompi_instance_print_error ("ompi_osc_base_find_available() failed", ret);
     }
 
diff --git a/ompi/mpi/c/request_get_status.c.in b/ompi/mpi/c/request_get_status.c.in
index 86ba237866b..fdc18339e15 100644
--- a/ompi/mpi/c/request_get_status.c.in
+++ b/ompi/mpi/c/request_get_status.c.in
@@ -38,9 +38,7 @@
 PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag,
                                          STATUS_OUT status)
 {
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
     int do_it_once = 0;
-#endif
 
     MEMCHECKER(
         memchecker_request(&request);
@@ -56,9 +54,7 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag,
         }
     }
 
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
  recheck_request_status:
-#endif
     opal_atomic_mb();
     if( (request == MPI_REQUEST_NULL) || (request->req_state == OMPI_REQUEST_INACTIVE) ) {
         *flag = true;
@@ -80,8 +76,8 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag,
         }
         return MPI_SUCCESS;
     }
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-    if( 0 == do_it_once ) {
+
+    if( 0 == do_it_once && !opal_async_progress_thread_spawned ) {
         /* If we run the opal_progress then check the status of the
            request before leaving. We will call the opal_progress only
            once per call. */
@@ -89,7 +85,7 @@ PROTOTYPE ERROR_CLASS request_get_status(REQUEST request, INT_OUT flag,
         do_it_once++;
         goto recheck_request_status;
     }
-#endif
+
     *flag = false;
     return MPI_SUCCESS;
 }
diff --git a/ompi/request/req_test.c b/ompi/request/req_test.c
index b28ade8a67a..20ecc75f0cf 100644
--- a/ompi/request/req_test.c
+++ b/ompi/request/req_test.c
@@ -32,11 +32,9 @@ int ompi_request_default_test(ompi_request_t ** rptr,
 {
     ompi_request_t *request = *rptr;
 
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
     int do_it_once = 0;
 
 recheck_request_status:
-#endif
     if( request->req_state == OMPI_REQUEST_INACTIVE ) {
         *completed = true;
         if (MPI_STATUS_IGNORE != status) {
@@ -81,8 +79,8 @@ int ompi_request_default_test(ompi_request_t ** rptr,
         return MPI_ERR_PROC_FAILED_PENDING;
     }
 #endif
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-    if( 0 == do_it_once ) {
+
+    if( 0 == do_it_once && !opal_async_progress_thread_spawned ) {
         /**
          * If we run the opal_progress then check the status of the request before
          * leaving. We will call the opal_progress only once per call.
@@ -92,7 +90,7 @@ int ompi_request_default_test(ompi_request_t ** rptr,
             goto recheck_request_status;
         }
     }
-#endif
+
     *completed = false;
     return OMPI_SUCCESS;
 }
@@ -163,9 +161,9 @@ int ompi_request_default_test_any(
     *index = MPI_UNDEFINED;
     if(num_requests_null_inactive != count) {
         *completed = false;
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-        opal_progress();
-#endif
+        if (!opal_async_progress_thread_spawned) {
+            opal_progress();
+        }
     } else {
         *completed = true;
         if (MPI_STATUS_IGNORE != status) {
@@ -208,8 +206,8 @@ int ompi_request_default_test_all(
             return MPI_ERR_PROC_FAILED_PENDING;
         }
 #endif /* OPAL_ENABLE_FT_MPI */
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-        if (0 == do_it_once) {
+
+        if (0 == do_it_once && !opal_async_progress_thread_spawned) {
             ++do_it_once;
             if (0 != opal_progress()) {
                 /* continue walking the list, retest the current request */
@@ -217,7 +215,7 @@ int ompi_request_default_test_all(
                 continue;
             }
         }
-#endif /* OPAL_ENABLE_PROGRESS_THREADS */
+
         /* short-circuit */
         break;
     }
@@ -353,9 +351,9 @@ int ompi_request_default_test_some(
     *outcount = num_requests_done;
 
     if (num_requests_done == 0) {
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-        opal_progress();
-#endif
+        if (!opal_async_progress_thread_spawned) {
+            opal_progress();
+        }
         return OMPI_SUCCESS;
     }
 
diff --git a/ompi/runtime/ompi_mpi_finalize.c b/ompi/runtime/ompi_mpi_finalize.c
index ad8a328dc55..8e4d7e66c52 100644
--- a/ompi/runtime/ompi_mpi_finalize.c
+++ b/ompi/runtime/ompi_mpi_finalize.c
@@ -193,9 +193,10 @@ int ompi_mpi_finalize(void)
     opal_atomic_swap_32(&ompi_mpi_state,
                         OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);
 
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
-    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
-#endif
+    /* shutdown async progress thread before tearing down further services */
+    if (opal_async_progress_thread_spawned) {
+        opal_progress_shutdown_async_progress_thread();
+    }
 
     /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
        *all* connected processes.  This only means that all processes
diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c
index 19c0999d163..cc539aa3b5f 100644
--- a/ompi/runtime/ompi_mpi_init.c
+++ b/ompi/runtime/ompi_mpi_init.c
@@ -522,16 +522,16 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
        time if so, then start the clock again */
     OMPI_TIMING_NEXT("barrier");
 
-#if OPAL_ENABLE_PROGRESS_THREADS == 0
     /* Start setting up the event engine for MPI operations.  Don't
        block in the event library, so that communications don't take
        forever between procs in the dynamic code.  This will increase
        CPU utilization for the remainder of MPI_INIT when we are
        blocking on RTE-level events, but may greatly reduce non-TCP
        latency. */
-    int old_event_flags = opal_progress_set_event_flag(0);
-    opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
-#endif
+    if (!opal_async_progress_thread_spawned) {
+        int old_event_flags = opal_progress_set_event_flag(0);
+        opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
+    }
 
     /* wire up the mpi interface, if requested.  Do this after the
        non-block switch for non-TCP performance.  Do before the
diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c
index c4389d422f2..ad2537a0c12 100644
--- a/opal/mca/btl/smcuda/btl_smcuda.c
+++ b/opal/mca/btl/smcuda/btl_smcuda.c
@@ -487,7 +487,7 @@ static struct mca_btl_base_endpoint_t *create_sm_endpoint(int local_proc, struct
     OBJ_CONSTRUCT(&ep->endpoint_lock, opal_mutex_t);
 #if OPAL_ENABLE_PROGRESS_THREADS == 1
     sprintf(path, "%s" OPAL_PATH_SEP "sm_fifo.%lu", opal_process_info.job_session_dir,
-            (unsigned long) proc->proc_name);
+            (unsigned long) proc->proc_name.vpid);
     ep->fifo_fd = open(path, O_WRONLY);
     if (ep->fifo_fd < 0) {
         opal_output(0, "mca_btl_smcuda_add_procs: open(%s) failed with errno=%d\n", path, errno);
diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c
index 72b75d67311..51981a0a08a 100644
--- a/opal/mca/btl/smcuda/btl_smcuda_component.c
+++ b/opal/mca/btl/smcuda/btl_smcuda_component.c
@@ -860,7 +860,7 @@ mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool
 #if OPAL_ENABLE_PROGRESS_THREADS == 1
     /* create a named pipe to receive events  */
     sprintf(mca_btl_smcuda_component.sm_fifo_path, "%s" OPAL_PATH_SEP "sm_fifo.%lu",
-            opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME->vpid);
+            opal_process_info.job_session_dir, (unsigned long) OPAL_PROC_MY_NAME.vpid);
     if (mkfifo(mca_btl_smcuda_component.sm_fifo_path, 0660) < 0) {
         opal_output(0, "mca_btl_smcuda_component_init: mkfifo failed with errno=%d\n", errno);
         return NULL;
diff --git a/opal/runtime/opal_params_core.h b/opal/runtime/opal_params_core.h
index b90d3fc2961..54f3d9298fb 100644
--- a/opal/runtime/opal_params_core.h
+++ b/opal/runtime/opal_params_core.h
@@ -60,6 +60,16 @@ OPAL_DECLSPEC extern bool opal_built_with_cuda_support;
 OPAL_DECLSPEC extern bool opal_built_with_rocm_support;
 OPAL_DECLSPEC extern bool opal_built_with_ze_support;
 
+#    if OPAL_ENABLE_PROGRESS_THREADS == 1
+OPAL_DECLSPEC extern bool opal_async_progress_thread_spawned;
+#    else
+/* When disabled at configure, using 'static const' will allow compilers to
+ * not only evaluate the boolean at compile time, but also to remove its
+ * storage within any object file which is including this header.
+ * See https://godbolt.org/z/hxc8EjPa4  */
+OPAL_DECLSPEC static const bool opal_async_progress_thread_spawned = false;
+#    endif
+
 /**
  *  * Whether we want to enable CUDA GPU buffer send and receive support.
  *   */
diff --git a/opal/runtime/opal_progress.c b/opal/runtime/opal_progress.c
index 7bca660b9d6..a4b5bdfc3c0 100644
--- a/opal/runtime/opal_progress.c
+++ b/opal/runtime/opal_progress.c
@@ -50,6 +50,34 @@ bool opal_progress_debug = false;
 static int opal_progress_event_flag = OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK;
 int opal_progress_spin_count = 10000;
 
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+/* track whether the async progress thread was spawned */
+/* default=disabled. Enabled with runtime env var [OMPI_ASYNC_PROGRESS|OPAL_ASYNC_PROGRESS]=1
+ * Preference order : OMPI_ASYNC_PROGRESS is preferred to OPAL_ASYNC_PROGRESS if both set
+ * */
+bool opal_async_progress_thread_spawned = false;
+
+/* async progress thread info & args */
+typedef struct thread_args_s {
+    /* number of events reported.
+     * This is updated by the async progress thread, to be read and resetted
+     * by the application threads */
+    opal_atomic_int64_t nb_events_reported;
+
+    /* should continue running ? */
+    volatile bool running;
+} thread_args_t;
+
+/* async progress thread routine */
+static void *opal_progress_async_thread_engine(opal_object_t *obj);
+
+static opal_thread_t opal_progress_async_thread;
+static thread_args_t thread_arg = {
+    .nb_events_reported = 0,
+    .running = false
+};
+#endif
+
 /*
  * Local variables
  */
@@ -119,9 +147,29 @@ static void opal_progress_finalize(void)
     opal_atomic_unlock(&progress_lock);
 }
 
+void opal_progress_shutdown_async_progress_thread(void)
+{
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    if (opal_async_progress_thread_spawned) {
+        /* shutdown the async thread */
+        thread_arg.running = false;
+        int err = opal_thread_join(&opal_progress_async_thread, NULL);
+        if (OPAL_SUCCESS == err) {
+            opal_set_using_threads(false);
+            opal_async_progress_thread_spawned = false;
+        } else {
+            OPAL_OUTPUT(
+                (debug_output, "progress: Failed to join async progress thread: err=%d", err));
+        }
+    }
+#endif
+}
+
 /* init the progress engine - called from orte_init */
 int opal_progress_init(void)
 {
+    int err = OPAL_SUCCESS;
+
     /* reentrant issues */
     opal_atomic_lock_init(&progress_lock, OPAL_ATOMIC_LOCK_UNLOCKED);
 
@@ -155,6 +203,41 @@ int opal_progress_init(void)
         callbacks_lp[i] = fake_cb;
     }
 
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    // check if the env var OMPI_ASYNC_PROGRESS or OPAL_ASYNC_PROGRESS is set
+    // at runtime.
+    char *evar_ompi = getenv("OMPI_ASYNC_PROGRESS");
+    char *evar_opal = getenv("OPAL_ASYNC_PROGRESS");
+    // prefer OMPI_ASYNC_PROGRESS to OPAL_ASYNC_PROGRESS
+    char *evar = evar_ompi ? evar_ompi : evar_opal;
+
+    const bool opal_async_progress_enabled = evar ? (atoi(evar) != 0) : false;
+
+    if (opal_async_progress_enabled && !opal_async_progress_thread_spawned) {
+        /* prepare the thread */
+        thread_arg.nb_events_reported = 0;
+        thread_arg.running = true;
+        OBJ_CONSTRUCT(&opal_progress_async_thread, opal_thread_t);
+        opal_progress_async_thread.t_run = opal_progress_async_thread_engine;
+        opal_progress_async_thread.t_arg = &thread_arg;
+
+        /* optimistic setting for asynchronism here, but we know that these might
+        still be changed by other *init() routines :( */
+        opal_progress_set_yield_when_idle(true);
+        opal_progress_set_event_flag(opal_progress_event_flag | OPAL_EVLOOP_NONBLOCK);
+
+        err = opal_thread_start(&opal_progress_async_thread);
+        if (OPAL_SUCCESS == err) {
+            opal_set_using_threads(true);
+            opal_async_progress_thread_spawned = true;
+        } else {
+            thread_arg.running = false;
+            OPAL_OUTPUT(
+                (debug_output, "progress: Failed to start async progress thread: err=%d", err));
+        }
+    }
+#endif
+
     OPAL_OUTPUT(
         (debug_output, "progress: initialized event flag to: %x", opal_progress_event_flag));
     OPAL_OUTPUT((debug_output, "progress: initialized yield_when_idle to: %s",
@@ -165,7 +248,7 @@ int opal_progress_init(void)
 
     opal_finalize_register_cleanup(opal_progress_finalize);
 
-    return OPAL_SUCCESS;
+    return err;
 }
 
 static int opal_progress_events(void)
@@ -213,7 +296,7 @@ static int opal_progress_events(void)
  * care, as the cost of that happening is far outweighed by the cost
  * of the if checks (they were resulting in bad pipe stalling behavior)
  */
-int opal_progress(void)
+static int _opal_progress(void)
 {
     static uint32_t num_calls = 0;
     size_t i;
@@ -224,13 +307,17 @@ int opal_progress(void)
         events += (callbacks[i])();
     }
 
-    /* Run low priority callbacks and events once every 8 calls to opal_progress().
+    /* Run low priority callbacks and events once every <N> calls to opal_progress().
      * Even though "num_calls" can be modified by multiple threads, we do not use
      * atomic operations here, for performance reasons. In case of a race, the
      * number of calls may be inaccurate, but since it will eventually be incremented,
      * it's not a problem.
+     * If opal_async_progress_thread_spawned == false, then N = 8
+     * otherwise let's pick N = 256 for the moment (George's recommendation) and
+     * adapt it later if it takes too many resources.
      */
-    if (((num_calls++) & 0x7) == 0) {
+    const uint32_t mod = opal_async_progress_thread_spawned ? 0xFF : 0x7;
+    if (((num_calls++) & mod) == 0) {
         for (i = 0; i < callbacks_lp_len; ++i) {
             events += (callbacks_lp[i])();
         }
@@ -254,6 +341,48 @@ int opal_progress(void)
     return events;
 }
 
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+static void *opal_progress_async_thread_engine(opal_object_t *obj)
+{
+    opal_thread_t *current_thread = (opal_thread_t *) obj;
+    thread_args_t *p_thread_arg = (thread_args_t *) current_thread->t_arg;
+
+    while (p_thread_arg->running) {
+        const int64_t new_events = _opal_progress();
+        if (new_events > 0) {
+            opal_atomic_add_fetch_64(&p_thread_arg->nb_events_reported, new_events);
+        }
+    }
+
+    return OPAL_THREAD_CANCELLED;
+}
+#endif
+
+int opal_progress(void)
+{
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    if (opal_async_progress_thread_spawned) {
+        /* async progress thread alongside may has processed new events,
+         * atomically read and reset nb_events_reported to zero.
+         */
+        const int64_t new_events = opal_atomic_swap_64(&thread_arg.nb_events_reported, 0);
+
+        /* if no new event, then application thread may yield here */
+        if (opal_progress_yield_when_idle && new_events <= 0) {
+            opal_thread_yield();
+        }
+        return new_events;
+    } else {
+#endif
+
+    /* no async progress thread, call the normal progress routine like before */
+    return _opal_progress();
+
+#if OPAL_ENABLE_PROGRESS_THREADS == 1
+    }
+#endif
+}
+
 int opal_progress_set_event_flag(int flag)
 {
     int tmp = opal_progress_event_flag;
diff --git a/opal/runtime/opal_progress.h b/opal/runtime/opal_progress.h
index 86a0f8add50..d6a782330d8 100644
--- a/opal/runtime/opal_progress.h
+++ b/opal/runtime/opal_progress.h
@@ -142,7 +142,7 @@ OPAL_DECLSPEC void opal_progress_set_event_poll_rate(int microseconds);
  *
  * Prototype for the a progress function callback.  Progress function
  * callbacks can be registered with opal_progress_register() and
- * deregistered with opal_progress_deregister().  It should be noted
+ * deregistered with opal_progress_unregister().  It should be noted
  * that either registering or deregistering a function callback is an
  * extraordinarily expensive operation and should not be used for
  * potentially short callback lifetimes.
@@ -195,6 +195,9 @@ static inline bool opal_progress_spin(volatile bool *complete)
     return false;
 }
 
+/* shutdown the async progress thread. Do nothing if disabled */
+OPAL_DECLSPEC void opal_progress_shutdown_async_progress_thread(void);
+
 END_C_DECLS
 
 #endif