Skip to content

Commit 94a56ab

Browse files
committed
coll/accelerator: add bcast,allgather,alltoall
add support for bcast, allgather and alltoall for device buffers using a temporary buffer on the CPU. The maximum msg length for each operation for which to use this approach can be controlled through an mca parameter. Note, for allgather and alltoall, the parameter represents the product of communicator size * msg length per proc. Signed-off-by: Edgar Gabriel <[email protected]>
1 parent 70a4ea3 commit 94a56ab

7 files changed

+352
-2
lines changed

ompi/mca/coll/accelerator/Makefile.am

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111
# $HEADER$
1212
#
1313

14-
sources = coll_accelerator_module.c coll_accelerator_reduce.c coll_accelerator_allreduce.c \
14+
sources = coll_accelerator_reduce.c coll_accelerator_allreduce.c \
1515
coll_accelerator_reduce_scatter_block.c coll_accelerator_reduce_scatter.c \
16-
coll_accelerator_component.c \
16+
coll_accelerator_allgather.c coll_accelerator_alltoall.c coll_accelerator_bcast.c \
17+
coll_accelerator_component.c coll_accelerator_module.c \
1718
coll_accelerator_scan.c coll_accelerator_exscan.c coll_accelerator.h
1819

1920
# Make the output library in this directory, and name it either

ompi/mca/coll/accelerator/coll_accelerator.h

+27
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ BEGIN_C_DECLS
3434

3535
/* API functions */
3636

37+
38+
extern int mca_coll_accelerator_bcast_thresh;
39+
extern int mca_coll_accelerator_allgather_thresh;
40+
extern int mca_coll_accelerator_alltoall_thresh;
41+
3742
int mca_coll_accelerator_init_query(bool enable_progress_threads,
3843
bool enable_mpi_threads);
3944
mca_coll_base_module_t
@@ -85,6 +90,28 @@ mca_coll_accelerator_reduce_scatter(const void *sbuf, void *rbuf, ompi_count_arr
8590
struct ompi_communicator_t *comm,
8691
mca_coll_base_module_t *module);
8792

93+
int
94+
mca_coll_accelerator_allgather(const void *sbuf, size_t scount,
95+
struct ompi_datatype_t *sdtype,
96+
void *rbuf, size_t rcount,
97+
struct ompi_datatype_t *rdtype,
98+
struct ompi_communicator_t *comm,
99+
mca_coll_base_module_t *module);
100+
101+
int
102+
mca_coll_accelerator_alltoall(const void *sbuf, size_t scount,
103+
struct ompi_datatype_t *sdtype,
104+
void *rbuf, size_t rcount,
105+
struct ompi_datatype_t *rdtype,
106+
struct ompi_communicator_t *comm,
107+
mca_coll_base_module_t *module);
108+
109+
int
110+
mca_coll_accelerator_bcast(void *buff, size_t count,
111+
struct ompi_datatype_t *datatype,
112+
int root,
113+
struct ompi_communicator_t *comm,
114+
mca_coll_base_module_t *module);
88115

89116
/* Checks the type of pointer
90117
*
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* Copyright (c) 2014-2017 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
5+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
6+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
7+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
8+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
9+
* $COPYRIGHT$
10+
*
11+
* Additional copyrights may follow
12+
*
13+
* $HEADER$
14+
*/
15+
16+
#include "ompi_config.h"
17+
#include "coll_accelerator.h"
18+
19+
#include <stdio.h>
20+
21+
#include "ompi/op/op.h"
22+
#include "opal/datatype/opal_convertor.h"
23+
24+
/*
25+
* allreduce_intra
26+
*
27+
* Function: - allgather for device buffers through temp CPU buffer
28+
* Accepts: - same as MPI_Allgather()
29+
* Returns: - MPI_SUCCESS or error code
30+
*/
31+
int
32+
mca_coll_accelerator_allgather(const void *sbuf, size_t scount,
33+
struct ompi_datatype_t *sdtype,
34+
void *rbuf, size_t rcount,
35+
struct ompi_datatype_t *rdtype,
36+
struct ompi_communicator_t *comm,
37+
mca_coll_base_module_t *module)
38+
{
39+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
40+
ptrdiff_t sgap, rgap;
41+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
42+
int sbuf_dev, rbuf_dev;
43+
size_t sbufsize, rbufsize;
44+
int rc;
45+
int comm_size = ompi_comm_size(comm);
46+
47+
sbufsize = opal_datatype_span(&sdtype->super, scount, &sgap);
48+
rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev);
49+
if (rc < 0) {
50+
return rc;
51+
}
52+
if ((MPI_IN_PLACE != sbuf) && (rc > 0) &&
53+
((sbufsize * comm_size) <= (size_t)mca_coll_accelerator_allgather_thresh)) {
54+
sbuf1 = (char*)malloc(sbufsize * comm_size);
55+
if (NULL == sbuf1) {
56+
return OMPI_ERR_OUT_OF_RESOURCE;
57+
}
58+
mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev,
59+
sbufsize, MCA_ACCELERATOR_TRANSFER_DTOH);
60+
sbuf = sbuf1 - sgap;
61+
}
62+
63+
rbufsize = opal_datatype_span(&rdtype->super, rcount, &rgap);
64+
rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev);
65+
if (rc < 0) {
66+
goto exit;
67+
}
68+
/* Using sbufsize here on purpose to ensure symmetric decision for handling of GPU vs
69+
CPU buffers. The two buffer sizes are expected to be the same for pre-defined datatypes,
70+
but could vary due to layout issues/gaps for derived datatypes */
71+
if ((rc > 0) && ((sbufsize * comm_size) <= (size_t)mca_coll_accelerator_allgather_thresh)) {
72+
rbuf1 = (char*)malloc(rbufsize * comm_size);
73+
if (NULL == rbuf1) {
74+
rc = OMPI_ERR_OUT_OF_RESOURCE;
75+
goto exit;
76+
}
77+
mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev,
78+
rbufsize, MCA_ACCELERATOR_TRANSFER_DTOH);
79+
rbuf2 = rbuf; /* save original buffer */
80+
rbuf = rbuf1 - rgap;
81+
}
82+
rc = s->c_coll.coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
83+
comm, s->c_coll.coll_allgather_module);
84+
if (rc < 0) {
85+
goto exit;
86+
}
87+
88+
if (NULL != rbuf1) {
89+
mca_coll_accelerator_memcpy(rbuf2, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbufsize,
90+
MCA_ACCELERATOR_TRANSFER_HTOD);
91+
}
92+
93+
exit:
94+
if (NULL != sbuf1) {
95+
free(sbuf1);
96+
}
97+
if (NULL != rbuf1) {
98+
free(rbuf1);
99+
}
100+
101+
return rc;
102+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/*
2+
* Copyright (c) 2014-2017 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
5+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
6+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
7+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
8+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
9+
* $COPYRIGHT$
10+
*
11+
* Additional copyrights may follow
12+
*
13+
* $HEADER$
14+
*/
15+
16+
#include "ompi_config.h"
17+
#include "coll_accelerator.h"
18+
19+
#include <stdio.h>
20+
21+
#include "ompi/op/op.h"
22+
#include "opal/datatype/opal_convertor.h"
23+
24+
/*
25+
* allreduce_intra
26+
*
27+
* Function: - alltoall for device buffers using temp. CPU buffer
28+
* Accepts: - same as MPI_Alltoall()
29+
* Returns: - MPI_SUCCESS or error code
30+
*/
31+
int
32+
mca_coll_accelerator_alltoall(const void *sbuf, size_t scount,
33+
struct ompi_datatype_t *sdtype,
34+
void *rbuf, size_t rcount,
35+
struct ompi_datatype_t *rdtype,
36+
struct ompi_communicator_t *comm,
37+
mca_coll_base_module_t *module)
38+
{
39+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
40+
ptrdiff_t sgap, rgap;
41+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
42+
int sbuf_dev, rbuf_dev;
43+
size_t sbufsize, rbufsize;
44+
int rc;
45+
int comm_size = ompi_comm_size(comm);
46+
47+
sbufsize = opal_datatype_span(&sdtype->super, scount, &sgap);
48+
rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev);
49+
if (rc < 0) {
50+
return rc;
51+
}
52+
if ((MPI_IN_PLACE != sbuf) && (rc > 0) &&
53+
((sbufsize * comm_size) <= (size_t)mca_coll_accelerator_alltoall_thresh)) {
54+
sbuf1 = (char*)malloc(sbufsize * comm_size);
55+
if (NULL == sbuf1) {
56+
return OMPI_ERR_OUT_OF_RESOURCE;
57+
}
58+
mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev,
59+
sbufsize, MCA_ACCELERATOR_TRANSFER_DTOH);
60+
sbuf = sbuf1 - sgap;
61+
}
62+
63+
rbufsize = opal_datatype_span(&rdtype->super, rcount, &rgap);
64+
rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev);
65+
if (rc < 0) {
66+
goto exit;;
67+
}
68+
/* Using sbufsize here on purpose to ensure symmetric decision for handling of GPU vs
69+
CPU buffers. The two buffer sizes are expected to be the same for pre-defined datatypes,
70+
but could vary due to layout issues/gaps for derived datatypes */
71+
if ((rc > 0) && ((sbufsize * comm_size) <= (size_t)mca_coll_accelerator_alltoall_thresh)) {
72+
rbuf1 = (char*)malloc(rbufsize * comm_size);
73+
if (NULL == rbuf1) {
74+
rc = OMPI_ERR_OUT_OF_RESOURCE;
75+
goto exit;
76+
}
77+
mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev,
78+
rbufsize, MCA_ACCELERATOR_TRANSFER_DTOH);
79+
rbuf2 = rbuf; /* save away original buffer */
80+
rbuf = rbuf1 - rgap;
81+
}
82+
rc = s->c_coll.coll_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
83+
comm, s->c_coll.coll_alltoall_module);
84+
if (rc < 0) {
85+
goto exit;;
86+
}
87+
if (NULL != rbuf1) {
88+
mca_coll_accelerator_memcpy(rbuf2, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbufsize,
89+
MCA_ACCELERATOR_TRANSFER_HTOD);
90+
}
91+
92+
exit:
93+
if (NULL != sbuf1) {
94+
free(sbuf1);
95+
}
96+
if (NULL != rbuf1) {
97+
free(rbuf1);
98+
}
99+
100+
return rc;
101+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
3+
* Copyright (c) 2004-2023 The University of Tennessee and The University
4+
* of Tennessee Research Foundation. All rights
5+
* reserved.
6+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
7+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
8+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
9+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
10+
* $COPYRIGHT$
11+
*
12+
* Additional copyrights may follow
13+
*
14+
* $HEADER$
15+
*/
16+
17+
#include "ompi_config.h"
18+
#include "coll_accelerator.h"
19+
20+
#include <stdio.h>
21+
22+
#include "ompi/op/op.h"
23+
#include "opal/datatype/opal_convertor.h"
24+
25+
/*
26+
*
27+
* Function: - Bcast for device buffers through temp CPU buffer.
28+
* Accepts: - same as MPI_Bcast()
29+
* Returns: - MPI_SUCCESS or error code
30+
*/
31+
int
32+
mca_coll_accelerator_bcast(void *orig_buf, size_t count,
33+
struct ompi_datatype_t *datatype,
34+
int root,
35+
struct ompi_communicator_t *comm,
36+
mca_coll_base_module_t *module)
37+
{
38+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
39+
ptrdiff_t gap;
40+
char *buf1 = NULL;
41+
char *sbuf = (char*) orig_buf;
42+
int buf_dev;
43+
size_t bufsize;
44+
int rc;
45+
46+
bufsize = opal_datatype_span(&datatype->super, count, &gap);
47+
48+
rc = mca_coll_accelerator_check_buf((void *)orig_buf, &buf_dev);
49+
if (rc < 0) {
50+
return rc;
51+
}
52+
if ((rc > 0) && (count <= (size_t)mca_coll_accelerator_bcast_thresh)) {
53+
buf1 = (char*)malloc(bufsize);
54+
if (NULL == buf1) {
55+
return OMPI_ERR_OUT_OF_RESOURCE;
56+
}
57+
mca_coll_accelerator_memcpy(buf1, MCA_ACCELERATOR_NO_DEVICE_ID, orig_buf, buf_dev, bufsize,
58+
MCA_ACCELERATOR_TRANSFER_DTOH);
59+
sbuf = buf1 - gap;
60+
}
61+
62+
rc = s->c_coll.coll_bcast((void *) sbuf, count, datatype, root, comm,
63+
s->c_coll.coll_bcast_module);
64+
if (rc < 0) {
65+
goto exit;
66+
}
67+
if (NULL != buf1) {
68+
mca_coll_accelerator_memcpy((void*)orig_buf, buf_dev, buf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize,
69+
MCA_ACCELERATOR_TRANSFER_HTOD);
70+
}
71+
72+
exit:
73+
if (NULL != buf1) {
74+
free(buf1);
75+
}
76+
77+
return rc;
78+
}

ompi/mca/coll/accelerator/coll_accelerator_component.c

+32
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
#include "ompi/constants.h"
2323
#include "coll_accelerator.h"
2424

25+
26+
int mca_coll_accelerator_bcast_thresh = 256;
27+
int mca_coll_accelerator_allgather_thresh = 65536;
28+
int mca_coll_accelerator_alltoall_thresh = 65536;
29+
2530
/*
2631
* Public string showing the coll ompi_accelerator component version number
2732
*/
@@ -88,5 +93,32 @@ static int accelerator_register(void)
8893
MCA_BASE_VAR_SCOPE_READONLY,
8994
&mca_coll_accelerator_component.disable_accelerator_coll);
9095

96+
mca_coll_accelerator_bcast_thresh = 256;
97+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
98+
"bcast_thresh",
99+
"max. msg length for which to copy accelerator buffer to CPU for bcast operation",
100+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
101+
OPAL_INFO_LVL_9,
102+
MCA_BASE_VAR_SCOPE_READONLY,
103+
&mca_coll_accelerator_bcast_thresh);
104+
105+
mca_coll_accelerator_allgather_thresh = 65536;
106+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
107+
"allgather_thresh",
108+
"max. overall msg length for which to copy accelerator buffer to CPU for allgather operation",
109+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
110+
OPAL_INFO_LVL_9,
111+
MCA_BASE_VAR_SCOPE_READONLY,
112+
&mca_coll_accelerator_allgather_thresh);
113+
114+
mca_coll_accelerator_alltoall_thresh = 65536;
115+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
116+
"alltoall_thresh",
117+
"max. overall msg length for which to copy accelerator buffer to CPU for alltoall operation",
118+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
119+
OPAL_INFO_LVL_9,
120+
MCA_BASE_VAR_SCOPE_READONLY,
121+
&mca_coll_accelerator_alltoall_thresh);
122+
91123
return OMPI_SUCCESS;
92124
}

0 commit comments

Comments
 (0)