Skip to content

Commit c5e02ab

Browse files
authored
Merge pull request #13006 from edgargabriel/topic/coll-accelerator-new-funcs
coll/accelerator add support for more functions
2 parents b5c94df + 8731f21 commit c5e02ab

9 files changed

+470
-5
lines changed

ompi/mca/coll/accelerator/Makefile.am

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
# $HEADER$
1212
#
1313

14-
sources = coll_accelerator_module.c coll_accelerator_reduce.c coll_accelerator_allreduce.c \
15-
coll_accelerator_reduce_scatter_block.c coll_accelerator_component.c \
14+
sources = coll_accelerator_reduce.c coll_accelerator_allreduce.c \
15+
coll_accelerator_reduce_scatter_block.c coll_accelerator_reduce_scatter.c \
16+
coll_accelerator_allgather.c coll_accelerator_alltoall.c coll_accelerator_bcast.c \
17+
coll_accelerator_component.c coll_accelerator_module.c \
1618
coll_accelerator_scan.c coll_accelerator_exscan.c coll_accelerator.h
1719

1820
# Make the output library in this directory, and name it either

ompi/mca/coll/accelerator/coll_accelerator.h

+34
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ BEGIN_C_DECLS
3434

3535
/* API functions */
3636

37+
38+
extern int mca_coll_accelerator_bcast_thresh;
39+
extern int mca_coll_accelerator_allgather_thresh;
40+
extern int mca_coll_accelerator_alltoall_thresh;
41+
3742
int mca_coll_accelerator_init_query(bool enable_progress_threads,
3843
bool enable_mpi_threads);
3944
mca_coll_base_module_t
@@ -78,6 +83,35 @@ mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, size_t r
7883
struct ompi_communicator_t *comm,
7984
mca_coll_base_module_t *module);
8085

86+
int
87+
mca_coll_accelerator_reduce_scatter(const void *sbuf, void *rbuf, ompi_count_array_t rcounts,
88+
struct ompi_datatype_t *dtype,
89+
struct ompi_op_t *op,
90+
struct ompi_communicator_t *comm,
91+
mca_coll_base_module_t *module);
92+
93+
int
94+
mca_coll_accelerator_allgather(const void *sbuf, size_t scount,
95+
struct ompi_datatype_t *sdtype,
96+
void *rbuf, size_t rcount,
97+
struct ompi_datatype_t *rdtype,
98+
struct ompi_communicator_t *comm,
99+
mca_coll_base_module_t *module);
100+
101+
int
102+
mca_coll_accelerator_alltoall(const void *sbuf, size_t scount,
103+
struct ompi_datatype_t *sdtype,
104+
void *rbuf, size_t rcount,
105+
struct ompi_datatype_t *rdtype,
106+
struct ompi_communicator_t *comm,
107+
mca_coll_base_module_t *module);
108+
109+
int
110+
mca_coll_accelerator_bcast(void *buff, size_t count,
111+
struct ompi_datatype_t *datatype,
112+
int root,
113+
struct ompi_communicator_t *comm,
114+
mca_coll_base_module_t *module);
81115

82116
/* Checks the type of pointer
83117
*
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Copyright (c) 2014-2017 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
5+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
6+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
7+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
8+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
9+
* $COPYRIGHT$
10+
*
11+
* Additional copyrights may follow
12+
*
13+
* $HEADER$
14+
*/
15+
16+
#include "ompi_config.h"
17+
#include "coll_accelerator.h"
18+
19+
#include <stdio.h>
20+
21+
#include "ompi/op/op.h"
22+
#include "opal/datatype/opal_convertor.h"
23+
24+
/*
25+
* Function: - allgather for device buffers through temp CPU buffer
26+
* Accepts: - same as MPI_Allgather()
27+
* Returns: - MPI_SUCCESS or error code
28+
*/
29+
int
30+
mca_coll_accelerator_allgather(const void *sbuf, size_t scount,
31+
struct ompi_datatype_t *sdtype,
32+
void *rbuf, size_t rcount,
33+
struct ompi_datatype_t *rdtype,
34+
struct ompi_communicator_t *comm,
35+
mca_coll_base_module_t *module)
36+
{
37+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
38+
ptrdiff_t sgap, rgap;
39+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
40+
int sbuf_dev, rbuf_dev;
41+
size_t sbufsize, rbufsize;
42+
int rc;
43+
int comm_size = ompi_comm_size(comm);
44+
45+
sbufsize = opal_datatype_span(&sdtype->super, scount, &sgap);
46+
rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev);
47+
if (rc < 0) {
48+
return rc;
49+
}
50+
if ((MPI_IN_PLACE != sbuf) && (rc > 0) &&
51+
(sbufsize <= (size_t)mca_coll_accelerator_allgather_thresh)) {
52+
sbuf1 = (char*)malloc(sbufsize);
53+
if (NULL == sbuf1) {
54+
return OMPI_ERR_OUT_OF_RESOURCE;
55+
}
56+
mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev,
57+
sbufsize, MCA_ACCELERATOR_TRANSFER_DTOH);
58+
sbuf = sbuf1 - sgap;
59+
}
60+
61+
rbufsize = opal_datatype_span(&rdtype->super, rcount, &rgap);
62+
rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev);
63+
if (rc < 0) {
64+
goto exit;
65+
}
66+
/* Using sbufsize here on purpose to ensure symmetric decision for handling of GPU vs
67+
CPU buffers. The two buffer sizes are expected to be the same for pre-defined datatypes,
68+
but could vary due to layout issues/gaps for derived datatypes */
69+
if ((rc > 0) && (sbufsize <= (size_t)mca_coll_accelerator_allgather_thresh)) {
70+
rbuf1 = (char*)malloc(rbufsize * comm_size);
71+
if (NULL == rbuf1) {
72+
rc = OMPI_ERR_OUT_OF_RESOURCE;
73+
goto exit;
74+
}
75+
mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev,
76+
rbufsize * comm_size, MCA_ACCELERATOR_TRANSFER_DTOH);
77+
rbuf2 = rbuf; /* save original buffer */
78+
rbuf = rbuf1 - rgap;
79+
}
80+
rc = s->c_coll.coll_allgather(sbuf, scount, sdtype, rbuf, rcount, rdtype,
81+
comm, s->c_coll.coll_allgather_module);
82+
if (rc < 0) {
83+
goto exit;
84+
}
85+
86+
if (NULL != rbuf1) {
87+
mca_coll_accelerator_memcpy(rbuf2, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbufsize * comm_size,
88+
MCA_ACCELERATOR_TRANSFER_HTOD);
89+
}
90+
91+
exit:
92+
if (NULL != sbuf1) {
93+
free(sbuf1);
94+
}
95+
if (NULL != rbuf1) {
96+
free(rbuf1);
97+
}
98+
99+
return rc;
100+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright (c) 2014-2017 The University of Tennessee and The University
3+
* of Tennessee Research Foundation. All rights
4+
* reserved.
5+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
6+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
7+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
8+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
9+
* $COPYRIGHT$
10+
*
11+
* Additional copyrights may follow
12+
*
13+
* $HEADER$
14+
*/
15+
16+
#include "ompi_config.h"
17+
#include "coll_accelerator.h"
18+
19+
#include <stdio.h>
20+
21+
#include "ompi/op/op.h"
22+
#include "opal/datatype/opal_convertor.h"
23+
24+
/*
25+
* Function: - alltoall for device buffers using temp. CPU buffer
26+
* Accepts: - same as MPI_Alltoall()
27+
* Returns: - MPI_SUCCESS or error code
28+
*/
29+
int
30+
mca_coll_accelerator_alltoall(const void *sbuf, size_t scount,
31+
struct ompi_datatype_t *sdtype,
32+
void *rbuf, size_t rcount,
33+
struct ompi_datatype_t *rdtype,
34+
struct ompi_communicator_t *comm,
35+
mca_coll_base_module_t *module)
36+
{
37+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
38+
ptrdiff_t sgap, rgap;
39+
char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
40+
int sbuf_dev, rbuf_dev;
41+
size_t sbufsize, rbufsize;
42+
int rc;
43+
int comm_size = ompi_comm_size(comm);
44+
45+
sbufsize = opal_datatype_span(&sdtype->super, scount, &sgap);
46+
rc = mca_coll_accelerator_check_buf((void *)sbuf, &sbuf_dev);
47+
if (rc < 0) {
48+
return rc;
49+
}
50+
if ((MPI_IN_PLACE != sbuf) && (rc > 0) &&
51+
(sbufsize <= (size_t)mca_coll_accelerator_alltoall_thresh)) {
52+
sbuf1 = (char*)malloc(sbufsize * comm_size);
53+
if (NULL == sbuf1) {
54+
return OMPI_ERR_OUT_OF_RESOURCE;
55+
}
56+
mca_coll_accelerator_memcpy(sbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, sbuf, sbuf_dev,
57+
sbufsize * comm_size, MCA_ACCELERATOR_TRANSFER_DTOH);
58+
sbuf = sbuf1 - sgap;
59+
}
60+
61+
rbufsize = opal_datatype_span(&rdtype->super, rcount, &rgap);
62+
rc = mca_coll_accelerator_check_buf(rbuf, &rbuf_dev);
63+
if (rc < 0) {
64+
goto exit;
65+
}
66+
/* Using sbufsize here on purpose to ensure symmetric decision for handling of GPU vs
67+
CPU buffers. The two buffer sizes are expected to be the same for pre-defined datatypes,
68+
but could vary due to layout issues/gaps for derived datatypes */
69+
if ((rc > 0) && (sbufsize <= (size_t)mca_coll_accelerator_alltoall_thresh)) {
70+
rbuf1 = (char*)malloc(rbufsize * comm_size);
71+
if (NULL == rbuf1) {
72+
rc = OMPI_ERR_OUT_OF_RESOURCE;
73+
goto exit;
74+
}
75+
mca_coll_accelerator_memcpy(rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID, rbuf, rbuf_dev,
76+
rbufsize * comm_size, MCA_ACCELERATOR_TRANSFER_DTOH);
77+
rbuf2 = rbuf; /* save away original buffer */
78+
rbuf = rbuf1 - rgap;
79+
}
80+
rc = s->c_coll.coll_alltoall(sbuf, scount, sdtype, rbuf, rcount, rdtype,
81+
comm, s->c_coll.coll_alltoall_module);
82+
if (rc < 0) {
83+
goto exit;;
84+
}
85+
if (NULL != rbuf1) {
86+
mca_coll_accelerator_memcpy(rbuf2, rbuf_dev, rbuf1, MCA_ACCELERATOR_NO_DEVICE_ID,
87+
rbufsize * comm_size, MCA_ACCELERATOR_TRANSFER_HTOD);
88+
}
89+
90+
exit:
91+
if (NULL != sbuf1) {
92+
free(sbuf1);
93+
}
94+
if (NULL != rbuf1) {
95+
free(rbuf1);
96+
}
97+
98+
return rc;
99+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
3+
* Copyright (c) 2004-2023 The University of Tennessee and The University
4+
* of Tennessee Research Foundation. All rights
5+
* reserved.
6+
* Copyright (c) 2014-2015 NVIDIA Corporation. All rights reserved.
7+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
8+
* Copyright (c) 2024 Triad National Security, LLC. All rights reserved.
9+
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
10+
* $COPYRIGHT$
11+
*
12+
* Additional copyrights may follow
13+
*
14+
* $HEADER$
15+
*/
16+
17+
#include "ompi_config.h"
18+
#include "coll_accelerator.h"
19+
20+
#include <stdio.h>
21+
22+
#include "ompi/op/op.h"
23+
#include "opal/datatype/opal_convertor.h"
24+
25+
/*
26+
*
27+
* Function: - Bcast for device buffers through temp CPU buffer.
28+
* Accepts: - same as MPI_Bcast()
29+
* Returns: - MPI_SUCCESS or error code
30+
*/
31+
int
32+
mca_coll_accelerator_bcast(void *orig_buf, size_t count,
33+
struct ompi_datatype_t *datatype,
34+
int root,
35+
struct ompi_communicator_t *comm,
36+
mca_coll_base_module_t *module)
37+
{
38+
mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
39+
ptrdiff_t gap;
40+
char *buf1 = NULL;
41+
char *sbuf = (char*) orig_buf;
42+
int buf_dev;
43+
size_t bufsize;
44+
int rc;
45+
46+
bufsize = opal_datatype_span(&datatype->super, count, &gap);
47+
48+
rc = mca_coll_accelerator_check_buf((void *)orig_buf, &buf_dev);
49+
if (rc < 0) {
50+
return rc;
51+
}
52+
if ((rc > 0) && (bufsize <= (size_t)mca_coll_accelerator_bcast_thresh)) {
53+
buf1 = (char*)malloc(bufsize);
54+
if (NULL == buf1) {
55+
return OMPI_ERR_OUT_OF_RESOURCE;
56+
}
57+
mca_coll_accelerator_memcpy(buf1, MCA_ACCELERATOR_NO_DEVICE_ID, orig_buf, buf_dev, bufsize,
58+
MCA_ACCELERATOR_TRANSFER_DTOH);
59+
sbuf = buf1 - gap;
60+
}
61+
62+
rc = s->c_coll.coll_bcast((void *) sbuf, count, datatype, root, comm,
63+
s->c_coll.coll_bcast_module);
64+
if (rc < 0) {
65+
goto exit;
66+
}
67+
if (NULL != buf1) {
68+
mca_coll_accelerator_memcpy((void*)orig_buf, buf_dev, buf1, MCA_ACCELERATOR_NO_DEVICE_ID, bufsize,
69+
MCA_ACCELERATOR_TRANSFER_HTOD);
70+
}
71+
72+
exit:
73+
if (NULL != buf1) {
74+
free(buf1);
75+
}
76+
77+
return rc;
78+
}

ompi/mca/coll/accelerator/coll_accelerator_component.c

+32
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
#include "ompi/constants.h"
2323
#include "coll_accelerator.h"
2424

25+
26+
int mca_coll_accelerator_bcast_thresh = 256;
27+
int mca_coll_accelerator_allgather_thresh = 65536;
28+
int mca_coll_accelerator_alltoall_thresh = 65536;
29+
2530
/*
2631
* Public string showing the coll ompi_accelerator component version number
2732
*/
@@ -88,5 +93,32 @@ static int accelerator_register(void)
8893
MCA_BASE_VAR_SCOPE_READONLY,
8994
&mca_coll_accelerator_component.disable_accelerator_coll);
9095

96+
mca_coll_accelerator_bcast_thresh = 256;
97+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
98+
"bcast_thresh",
99+
"max. msg length for which to copy accelerator buffer to CPU for bcast operation",
100+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
101+
OPAL_INFO_LVL_9,
102+
MCA_BASE_VAR_SCOPE_READONLY,
103+
&mca_coll_accelerator_bcast_thresh);
104+
105+
mca_coll_accelerator_allgather_thresh = 65536;
106+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
107+
"allgather_thresh",
108+
"max. msg length for which to copy accelerator buffer to CPU for allgather operation",
109+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
110+
OPAL_INFO_LVL_9,
111+
MCA_BASE_VAR_SCOPE_READONLY,
112+
&mca_coll_accelerator_allgather_thresh);
113+
114+
mca_coll_accelerator_alltoall_thresh = 65536;
115+
(void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
116+
"alltoall_thresh",
117+
"max. msg length for which to copy accelerator buffer to CPU for alltoall operation",
118+
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
119+
OPAL_INFO_LVL_9,
120+
MCA_BASE_VAR_SCOPE_READONLY,
121+
&mca_coll_accelerator_alltoall_thresh);
122+
91123
return OMPI_SUCCESS;
92124
}

0 commit comments

Comments
 (0)