  * Copyright (c) 2014-2020 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
- * Copyright (c) 2021-2024 Nanook Consulting  All rights reserved.
+ * Copyright (c) 2021-2025 Nanook Consulting  All rights reserved.
  * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      IBM Corporation.  All rights reserved.
@@ -110,7 +110,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     opal_process_name_t pname;
     opal_list_t ilist, mlist, rlist;
     pmix_info_t info, tinfo;
-    pmix_value_t pval;
     pmix_pdata_t pdat;
     pmix_proc_t *procs, pxproc;
     size_t nprocs, n;
@@ -394,86 +393,45 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
         goto exit;
     }
     if (!opal_list_is_empty(&ilist)) {
-        int prn, nprn = 0;
-        char *val;
         opal_process_name_t wildcard_rank;
-        i = 0; /* start from the begining */

         /* convert the list of new procs to a proc_t array */
-        new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist),
-                                              sizeof(ompi_proc_t *));
-        /* Extract the modex info for the first proc on the ilist, and then
-         * remove all processors in the same jobid from the list by getting
-         * their connection information and moving them into the proc array.
-         */
+        size = opal_list_get_size(&ilist);
+        new_proc_list = (ompi_proc_t**)calloc(size, sizeof(ompi_proc_t *));
+        // put the procs in the array, but order them by jobid so that
+        // all members of the same jobid are sequential
+        i = 0;
         do {
-            uint32_t *local_ranks_in_jobid = NULL;
             ompi_dpm_proct_caddy_t* next = NULL;
             cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist);
             proc = cd->p;
             wildcard_rank.jobid = proc->super.proc_name.jobid;
-            wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid;
-            /* retrieve the local peers for the specified jobid */
-            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCAL_PEERS,
-                                            &wildcard_rank, &val, PMIX_STRING);
-            if (OPAL_SUCCESS == rc && NULL != val) {
-                char **peers = opal_argv_split(val, ',');
-                free(val);
-                nprn = opal_argv_count(peers);
-                local_ranks_in_jobid = (uint32_t*)calloc(nprn, sizeof(uint32_t));
-                for (prn = 0; NULL != peers[prn]; prn++) {
-                    local_ranks_in_jobid[prn] = strtoul(peers[prn], NULL, 10);
-                }
-                opal_argv_free(peers);
-            }
-
             OPAL_LIST_FOREACH_SAFE(cd, next, &ilist, ompi_dpm_proct_caddy_t) {
                 proc = cd->p;
-                if( proc->super.proc_name.jobid != wildcard_rank.jobid )
+                if (proc->super.proc_name.jobid != wildcard_rank.jobid) {
                     continue;  /* not a proc from this jobid */
-
+                }
+                // check name setup and set arch
+                ompi_proc_complete_init_single(proc);
                 new_proc_list[i] = proc;
+                ++i;
                 opal_list_remove_item(&ilist, (opal_list_item_t*)cd);  // TODO: do we need to release cd ?
                 OBJ_RELEASE(cd);
-                /* ompi_proc_complete_init_single() initializes and optionally retrieves
-                 * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
-                 * them, we are just fine */
-                ompi_proc_complete_init_single(proc);
-                /* if this proc is local, then get its locality */
-                if (NULL != local_ranks_in_jobid) {
-                    uint16_t u16;
-                    for (prn=0; prn < nprn; prn++) {
-                        if (local_ranks_in_jobid[prn] == proc->super.proc_name.vpid) {
-                            /* get their locality string */
-                            val = NULL;
-                            OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
-                                                            &proc->super.proc_name, &val, PMIX_STRING);
-                            if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
-                                u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
-                                free(val);
-                            } else {
-                                /* all we can say is that it shares our node */
-                                u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
-                            }
-                            proc->super.proc_flags = u16;
-                            /* save the locality for later */
-                            OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name);
-                            pval.type = PMIX_UINT16;
-                            pval.data.uint16 = proc->super.proc_flags;
-                            PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval);
-                            break;
-                        }
-                    }
-                }
-                ++i;
-            }
-            if (NULL != local_ranks_in_jobid) {
-                free(local_ranks_in_jobid);
             }
         } while (!opal_list_is_empty(&ilist));

+        // set locality for each proc
+        rc = ompi_dpm_set_locality(new_proc_list, size);
+        if (OPAL_SUCCESS != rc) {
+            OMPI_ERROR_LOG(rc);
+            free(new_proc_list);
+            new_proc_list = NULL;
+            OPAL_LIST_DESTRUCT(&ilist);
+            goto exit;
+        }
+
         /* call add_procs on the new ones */
-        rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist)));
+        rc = MCA_PML_CALL(add_procs(new_proc_list, size));
         free(new_proc_list);
         new_proc_list = NULL;
         if (OMPI_SUCCESS != rc) {
@@ -561,6 +519,93 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     return rc;
 }

+int ompi_dpm_set_locality(ompi_proc_t **procs, int rsize)
+{
+    pmix_nspace_t nspace;
+    pmix_proc_t *local_procs = NULL;
+    size_t nlocalprocs = 0;
+    pmix_status_t rc;
+    int i, ret;
+    pmix_proc_t pproc;
+    bool local;
+    size_t m;
+    uint16_t u16, *u16ptr = &u16;
+    char *val;
+    pmix_value_t pval;
+
+    // lazy-execute the resolve - we may not need to do it and
+    // it is an expensive operation since it must go to the
+    // local server if we aren't a singleton
+
+    /* set the locality of the remote procs */
+    for (i=0; i < rsize; i++) {
+        OPAL_PMIX_CONVERT_NAME(&pproc, &procs[i]->super.proc_name);
+
+        // first check to see if the locality is available - do
+        // this as an "optional" check. It could be we previously
+        // computed locality for this proc, and the check is fast
+        // since it only is done locally.
+        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, PMIX_LOCALITY,
+                                       &procs[i]->super.proc_name, &u16ptr, PMIX_UINT16);
+        if (OPAL_SUCCESS == ret) {
+            procs[i]->super.proc_flags = u16;
+            continue;
+        }
+
+        // if we didn't find it, then we have to actually compute
+        // the locality for this proc. check to see if we have
+        // already resolved the peers - if not, then do so
+        if (NULL == local_procs) {
+            /* get the local procs - need all local procs since
+             * we may have multiple namespaces involved */
+            PMIx_Load_nspace(nspace, NULL);
+            rc = PMIx_Resolve_peers(NULL, nspace, &local_procs, &nlocalprocs);
+            if (PMIX_SUCCESS != rc) {
+                return OMPI_ERROR;
+            }
+        }
+
+        /* see if this process is local to this node */
+        local = false;
+        for (m=0; m < nlocalprocs; m++) {
+            if (PMIX_CHECK_PROCID(&local_procs[m], &pproc)) {
+                // this is a local process
+                local = true;
+                break;
+            }
+        }
+        if (!local) {
+            // this proc is not on the same node as us
+            procs[i]->super.proc_flags = OPAL_PROC_NON_LOCAL;
+            continue;
+        }
+
+        /* get the locality information - all RTEs are required
+         * to provide this information at startup. However, note
+         * that locality is ONLY defined for procs that are BOUND
+         * as it requires that a proc be in a known location! */
+        val = NULL;
+        OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
+                                        &procs[i]->super.proc_name, &val, PMIX_STRING);
+        if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
+            u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
+            free(val);
+        } else {
+            /* all we can say is that it shares our node */
+            u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
+        }
+        procs[i]->super.proc_flags = u16;
+        /* save the locality for later */
+        pval.type = PMIX_UINT16;
+        pval.data.uint16 = procs[i]->super.proc_flags;
+        PMIx_Store_internal(&pproc, PMIX_LOCALITY, &pval);
+    }
+    if (NULL != local_procs) {
+        PMIX_PROC_FREE(local_procs, nlocalprocs);
+    }
+    return OMPI_SUCCESS;
+}
+
 static int construct_peers(ompi_group_t *group, opal_list_t *peers)
 {
     int i;