From 5070d7f38efbaa59f637309e334e362ca321c3c3 Mon Sep 17 00:00:00 2001 From: Minh Quan Ho Date: Tue, 7 May 2024 10:52:21 +0200 Subject: [PATCH] ompi/instance: fix cleanup function registration order - Append the PML cleanup function to the finalize sequence of the instance domain ('ompi_instance_common_domain') before RTE/OPAL init. - The reason is that RTE init (ompi_rte_init()) calls opal_init(), which in turn sets the internal tracking domain to OPAL's own domain ('opal_init_domain'); the PML cleanup function would then be mis-registered as belonging to 'opal_init_domain' instead of the current 'ompi_instance_common_domain'. - The consequence of this mis-registration is that, at MPI_Finalize(), the PML cleanup (*_del_procs()) is executed by the RTE; depending on the registration order, this may pull the rug out from under other components that are still running (*_progress()). - This may be the root cause of issue #10117. Signed-off-by: Minh Quan Ho --- ompi/instance/instance.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index adf2e8ace89..1576fd5e3a7 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -8,6 +8,7 @@ * reserved. * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2025 SiPearl. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -381,6 +382,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) opal_finalize_domain_init (&ompi_instance_common_domain, "ompi_mpi_instance_init_common"); opal_finalize_set_domain (&ompi_instance_common_domain); + /* Append PML cleanup into the finalize of this domain ('ompi_instance_common_domain') + before RTE init */ + ompi_mpi_instance_append_finalize (ompi_mpi_instance_cleanup_pml); + if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) { return ompi_instance_print_error ("ompi_mpi_init: opal_arch_set_fortran_logical_size failed", ret); } @@ -638,8 +643,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv) return ompi_instance_print_error ("ompi_group_init() failed", ret); } - ompi_mpi_instance_append_finalize (ompi_mpi_instance_cleanup_pml); - /* initialize communicator subsystem */ if (OMPI_SUCCESS != (ret = ompi_comm_init ())) { opal_mutex_unlock (&instance_lock); @@ -906,8 +909,6 @@ static int ompi_mpi_instance_finalize_common (void) mca_mpool_base_tree_print (ompi_debug_show_mpi_alloc_mem_leaks); } - opal_finalize_cleanup_domain (&ompi_instance_common_domain); - if (NULL != ompi_mpi_main_thread) { OBJ_RELEASE(ompi_mpi_main_thread); ompi_mpi_main_thread = NULL; @@ -936,6 +937,9 @@ static int ompi_mpi_instance_finalize_common (void) ompi_rte_initialized = false; + /* Should be called in reverse order of init, i.e. after RTE finalize */ + opal_finalize_cleanup_domain (&ompi_instance_common_domain); + for (int i = 0 ; ompi_lazy_frameworks[i] ; ++i) { if (0 < ompi_lazy_frameworks[i]->framework_refcnt) { /* May have been "opened" multiple times. We want it closed now! */