
Commit a8d12c7

Ensure we correctly identify local vs non-local peers
PMIX_LOCALITY is a value that is computed by OMPI when we do connect/accept - it is computed in opal_hwloc_compute_relative_locality and the value is locally stored on each proc. The reason is that PMIX_LOCALITY provides the location of a process relative to you - it isn't an absolute value representing the location of the process on the node. The absolute location of the proc is provided by the runtime in PMIX_LOCALITY_STRING. This is what was retrieved in dpm.c - and then used to compute the relative locality of that proc, which is then stored as PMIX_LOCALITY.

So the reason procs from two unconnected jobs aren't able to get each other's PMIX_LOCALITY values is simply because (a) they didn't go through connect/accept, and therefore (b) they never computed and saved those values.

Second, the runtime provides PMIX_LOCALITY_STRING only for those procs that have a defined location - i.e., procs that are BOUND. If a process is not bound, then it has no fixed location on the node, and so the runtime doesn't provide a locality string for it. Thus, getting "not found" for a modex retrieval on PMIX_LOCALITY_STRING is NOT a definitive indicator that the proc is on a different node. The only way to determine that a proc is on a different node is to get the list (or array) of procs on the node and see if the proc is on it. We do this in the dpm, but that step was missing from the comm code.

So what I've done here is create a new function, ompi_dpm_set_locality, that both connect/accept and get_rprocs can use, since the required functionality is identical. This will hopefully avoid similar mistakes in the future.

Signed-off-by: Ralph Castain <[email protected]>
1 parent b79b3e9 commit a8d12c7
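
To make the reasoning above concrete: the patch answers two separate questions for each peer - (1) is the proc on my node at all, which can only be decided from the node's proc list, and (2) if it is, what is its relative locality, which can only be computed when the proc is bound and therefore publishes a PMIX_LOCALITY_STRING. A condensed sketch of that flow follows; it uses the same PMIx/OPAL calls as the new ompi_dpm_set_locality() in the dpm.c diff below, but the helper name sketch_peer_locality, the resolve-failure fallback, and the assumption that the usual OMPI-internal headers (proc, pmix-internal, hwloc-internal) are in scope are illustrative only, not part of the patch.

/* Sketch only - condensed from the ompi_dpm_set_locality logic shown below. */
static uint16_t sketch_peer_locality(ompi_proc_t *proc)
{
    pmix_proc_t pproc, *peers = NULL;
    pmix_nspace_t nspace;
    size_t npeers = 0, m;
    bool on_node = false;
    char *locality_string = NULL;
    uint16_t flags;
    int rc;

    OPAL_PMIX_CONVERT_NAME(&pproc, &proc->super.proc_name);

    /* Step 1: the only reliable local/non-local test is membership in the
     * node's proc list - a missing PMIX_LOCALITY_STRING proves nothing. */
    PMIx_Load_nspace(nspace, NULL);   /* empty nspace => procs from all namespaces */
    if (PMIX_SUCCESS != PMIx_Resolve_peers(NULL, nspace, &peers, &npeers)) {
        return OPAL_PROC_NON_LOCAL;   /* illustrative fallback only */
    }
    for (m = 0; m < npeers; m++) {
        if (PMIX_CHECK_PROCID(&peers[m], &pproc)) {
            on_node = true;
            break;
        }
    }
    PMIX_PROC_FREE(peers, npeers);
    if (!on_node) {
        return OPAL_PROC_NON_LOCAL;
    }

    /* Step 2: only BOUND procs have a PMIX_LOCALITY_STRING, so "not found"
     * here just means "shares our node but has no fixed location". */
    OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
                                    &proc->super.proc_name,
                                    &locality_string, PMIX_STRING);
    if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
        flags = opal_hwloc_compute_relative_locality(ompi_process_info.locality,
                                                     locality_string);
        free(locality_string);
    } else {
        /* all we can say is that it shares our node */
        flags = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
    }
    return flags;
}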

File tree

5 files changed: +118 -67 lines changed

Diff for: .gitmodules

+1 -1

@@ -1,6 +1,6 @@
 [submodule "prrte"]
     path = 3rd-party/prrte
-    url = ../../open-mpi/prrte
+    url = ../../openpmix/prrte.git
     branch = master
 [submodule "openpmix"]
     path = 3rd-party/openpmix

Diff for: 3rd-party/openpmix

Submodule openpmix updated 214 files

Diff for: 3rd-party/prrte

Submodule prrte updated 247 files

Diff for: ompi/dpm/dpm.c

+109 -64

@@ -20,7 +20,7 @@
  * Copyright (c) 2014-2020 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates. All Rights reserved.
- * Copyright (c) 2021-2024 Nanook Consulting  All rights reserved.
+ * Copyright (c) 2021-2025 Nanook Consulting  All rights reserved.
  * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      IBM Corporation. All rights reserved.
@@ -110,7 +110,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     opal_process_name_t pname;
     opal_list_t ilist, mlist, rlist;
     pmix_info_t info, tinfo;
-    pmix_value_t pval;
     pmix_pdata_t pdat;
     pmix_proc_t *procs, pxproc;
     size_t nprocs, n;
@@ -394,86 +393,45 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
         goto exit;
     }
     if (!opal_list_is_empty(&ilist)) {
-        int prn, nprn = 0;
-        char *val;
         opal_process_name_t wildcard_rank;
-        i = 0; /* start from the begining */
 
         /* convert the list of new procs to a proc_t array */
-        new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist),
-                                              sizeof(ompi_proc_t *));
-        /* Extract the modex info for the first proc on the ilist, and then
-         * remove all processors in the same jobid from the list by getting
-         * their connection information and moving them into the proc array.
-         */
+        size = opal_list_get_size(&ilist);
+        new_proc_list = (ompi_proc_t**)calloc(size, sizeof(ompi_proc_t *));
+        // put the procs in the array, but order them by jobid so that
+        // all members of the same jobid are sequential
+        i = 0;
         do {
-            uint32_t *local_ranks_in_jobid = NULL;
             ompi_dpm_proct_caddy_t* next = NULL;
             cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist);
             proc = cd->p;
             wildcard_rank.jobid = proc->super.proc_name.jobid;
-            wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
-            /* retrieve the local peers for the specified jobid */
-            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCAL_PEERS,
-                                            &wildcard_rank, &val, PMIX_STRING);
-            if (OPAL_SUCCESS == rc && NULL != val) {
-                char **peers = opal_argv_split(val, ',');
-                free(val);
-                nprn = opal_argv_count(peers);
-                local_ranks_in_jobid = (uint32_t*)calloc(nprn, sizeof(uint32_t));
-                for (prn = 0; NULL != peers[prn]; prn++) {
-                    local_ranks_in_jobid[prn] = strtoul(peers[prn], NULL, 10);
-                }
-                opal_argv_free(peers);
-            }
-
             OPAL_LIST_FOREACH_SAFE(cd, next, &ilist, ompi_dpm_proct_caddy_t) {
                 proc = cd->p;
-                if( proc->super.proc_name.jobid != wildcard_rank.jobid )
+                if (proc->super.proc_name.jobid != wildcard_rank.jobid) {
                     continue;  /* not a proc from this jobid */
-
+                }
+                // check name setup and set arch
+                ompi_proc_complete_init_single(proc);
                 new_proc_list[i] = proc;
+                ++i;
                 opal_list_remove_item(&ilist, (opal_list_item_t*)cd);  // TODO: do we need to release cd ?
                 OBJ_RELEASE(cd);
-                /* ompi_proc_complete_init_single() initializes and optionally retrieves
-                 * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
-                 * them, we are just fine */
-                ompi_proc_complete_init_single(proc);
-                /* if this proc is local, then get its locality */
-                if (NULL != local_ranks_in_jobid) {
-                    uint16_t u16;
-                    for (prn=0; prn < nprn; prn++) {
-                        if (local_ranks_in_jobid[prn] == proc->super.proc_name.vpid) {
-                            /* get their locality string */
-                            val = NULL;
-                            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
-                                                            &proc->super.proc_name, &val, PMIX_STRING);
-                            if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
-                                u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
-                                free(val);
-                            } else {
-                                /* all we can say is that it shares our node */
-                                u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-                            }
-                            proc->super.proc_flags = u16;
-                            /* save the locality for later */
-                            OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name);
-                            pval.type = PMIX_UINT16;
-                            pval.data.uint16 = proc->super.proc_flags;
-                            PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval);
-                            break;
-                        }
-                    }
-                }
-                ++i;
-            }
-            if (NULL != local_ranks_in_jobid) {
-                free(local_ranks_in_jobid);
             }
         } while (!opal_list_is_empty(&ilist));
 
+        // set locality for each proc
+        rc = ompi_dpm_set_locality(new_proc_list, size);
+        if (OPAL_SUCCESS != rc) {
+            OMPI_ERROR_LOG(rc);
+            free(new_proc_list);
+            new_proc_list = NULL;
+            OPAL_LIST_DESTRUCT(&ilist);
+            goto exit;
+        }
+
         /* call add_procs on the new ones */
-        rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
+        rc = MCA_PML_CALL(add_procs(new_proc_list, size));
         free(new_proc_list);
         new_proc_list = NULL;
         if (OMPI_SUCCESS != rc) {
@@ -561,6 +519,93 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     return rc;
 }
 
+int ompi_dpm_set_locality(ompi_proc_t **procs, int rsize)
+{
+    pmix_nspace_t nspace;
+    pmix_proc_t *local_procs = NULL;
+    size_t nlocalprocs = 0;
+    pmix_status_t rc;
+    int i, ret;
+    pmix_proc_t pproc;
+    bool local;
+    size_t m;
+    uint16_t u16, *u16ptr = &u16;
+    char *val;
+    pmix_value_t pval;
+
+    // lazy-execute the resolve - we may not need to do it and
+    // it is an expensive operation since it must go to the
+    // local server if we aren't a singleton
+
+    /* set the locality of the remote procs */
+    for (i=0; i < rsize; i++) {
+        OPAL_PMIX_CONVERT_NAME(&pproc, &procs[i]->super.proc_name);
+
+        // first check to see if the locality is available - do
+        // this as an "optional" check. It could be we previously
+        // computed locality for this proc, and the check is fast
+        // since it only is done locally.
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_LOCALITY,
+                                       &procs[i]->super.proc_name, &u16ptr, PMIX_UINT16);
+        if (OPAL_SUCCESS == ret) {
+            procs[i]->super.proc_flags = u16;
+            continue;
+        }
+
+        // if we didn't find it, then we have to actually compute
+        // the locality for this proc. check to see if we have
+        // already resolved the peers - if not, then do so
+        if (NULL == local_procs) {
+            /* get the local procs - need all local procs since
+             * we may have multiple namespaces involved */
+            PMIx_Load_nspace(nspace, NULL);
+            rc = PMIx_Resolve_peers(NULL, nspace, &local_procs, &nlocalprocs);
+            if (PMIX_SUCCESS != rc) {
+                return OMPI_ERROR;
+            }
+        }
+
+        /* see if this process is local to this node */
+        local = false;
+        for (m=0; m < nlocalprocs; m++) {
+            if (PMIX_CHECK_PROCID(&local_procs[m], &pproc)) {
+                // this is a local process
+                local = true;
+                break;
+            }
+        }
+        if (!local) {
+            // this proc is not on the same node as us
+            procs[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
+            continue;
+        }
+
+        /* get the locality information - all RTEs are required
+         * to provide this information at startup. However, note
+         * that locality is ONLY defined for procs that are BOUND
+         * as it requires that a proc be in a known location! */
+        val = NULL;
+        OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
+                                        &procs[i]->super.proc_name, &val, PMIX_STRING);
+        if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
+            u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
+            free(val);
+        } else {
+            /* all we can say is that it shares our node */
+            u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
+        }
+        procs[i]->super.proc_flags = u16;
+        /* save the locality for later */
+        pval.type = PMIX_UINT16;
+        pval.data.uint16 = procs[i]->super.proc_flags;
+        PMIx_Store_internal(&pproc, PMIX_LOCALITY, &pval);
+    }
+    if (NULL != local_procs) {
+        PMIX_PROC_FREE(local_procs, nlocalprocs);
+    }
+    return OMPI_SUCCESS;
+}
+
 static int construct_peers(ompi_group_t *group, opal_list_t *peers)
 {
     int i;
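
The commit message notes that get_rprocs in the comm code gains the same call, so it stops treating a missing PMIX_LOCALITY_STRING as "different node"; that caller is not part of this file's diff, but its usage would presumably mirror the connect/accept path above. A hypothetical fragment, assuming rprocs is an already-initialized array of rsize ompi_proc_t pointers:

    /* Hypothetical caller sketch - mirrors the connect/accept usage above. */
    rc = ompi_dpm_set_locality(rprocs, rsize);
    if (OPAL_SUCCESS != rc) {
        OMPI_ERROR_LOG(rc);
        /* caller-specific cleanup of rprocs would go here */
        return rc;
    }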

Diff for: ompi/dpm/dpm.h

+6
@@ -15,6 +15,7 @@
  *                         reserved.
  * Copyright (c) 2018      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2025      Nanook Consulting  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -98,6 +99,11 @@ int ompi_dpm_open_port(char *port_name);
  */
 int ompi_dpm_close_port(const char *port_name);
 
+/*
+ * Compute locality for array of procs
+ */
+int ompi_dpm_set_locality(ompi_proc_t **procs, int rsize);
+
 END_C_DECLS
 
 #endif /* OMPI_DPM_H */
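
Once either caller has run, each peer's proc_flags field carries the stored locality bitmask, so later code can distinguish on-node from off-node peers without another modex lookup. A minimal sketch, assuming proc has already been through ompi_dpm_set_locality(); the branch comments are illustrative:

    /* Sketch: proc_flags holds the uint16_t locality bitmask stored above. */
    if (proc->super.proc_flags & OPAL_PROC_ON_NODE) {
        /* peer shares this node - shared-memory style transports are possible */
    } else {
        /* peer is on a different node (flags left as OPAL_PROC_NON_LOCAL) */
    }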

0 commit comments
