Skip to content

Commit 9aafefa

Browse files
Parallel-Split MPI_Alltoall algorithm as part of acoll collective.
-A new parallel-split algorithm for MPI_Alltoall is introduced as part of the acoll collective component, primarily targeting smaller message sizes (<= 4KB). The algorithm, at a high level, operates by dividing the ranks into n groups, performing alltoall (using a base alltoall routine) within the n groups in parallel, after which data is exchanged between groups of n adjacent ranks (starting from rank 0). For example, if n=2, this algorithm splits the ranks into 2 groups, one containing all even-ranked processes and another containing all odd-ranked processes. Alltoall is performed within these 2 groups in parallel, after which each adjacent even-odd pair (pairs being [0,1], [2,3], ...) exchanges data to complete the Alltoall operation. If n=4 or n=8, alltoall is performed within 4 or 8 groups in parallel. Following this step, groups of 4 or 8 adjacent ranks (starting from 0) exchange data among themselves to complete the alltoall operation. Signed-off-by: Mithun Mohan <[email protected]>
1 parent 13d0522 commit 9aafefa

File tree

6 files changed

+830
-3
lines changed

6 files changed

+830
-3
lines changed

ompi/mca/coll/acoll/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ sources = \
1515
coll_acoll_allgather.c \
1616
coll_acoll_bcast.c \
1717
coll_acoll_gather.c \
18+
coll_acoll_alltoall.c \
1819
coll_acoll_reduce.c \
1920
coll_acoll_allreduce.c \
2021
coll_acoll_barrier.c \

ompi/mca/coll/acoll/coll_acoll.h

+31-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
3+
* Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
44
* $COPYRIGHT$
55
*
66
* Additional copyrights may follow
@@ -66,6 +66,13 @@ int mca_coll_acoll_gather_intra(const void *sbuf, size_t scount, struct ompi_dat
6666
void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype, int root,
6767
struct ompi_communicator_t *comm, mca_coll_base_module_t *module);
6868

69+
int mca_coll_acoll_alltoall(const void *sbuf, size_t scount,
70+
struct ompi_datatype_t *sdtype,
71+
void* rbuf, size_t rcount,
72+
struct ompi_datatype_t *rdtype,
73+
struct ompi_communicator_t *comm,
74+
mca_coll_base_module_t *module);
75+
6976
int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
7077
struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root,
7178
struct ompi_communicator_t *comm, mca_coll_base_module_t *module);
@@ -80,6 +87,8 @@ int mca_coll_acoll_barrier_intra(struct ompi_communicator_t *comm, mca_coll_base
8087
END_C_DECLS
8188

8289
#define MCA_COLL_ACOLL_ROOT_CHANGE_THRESH 10
90+
#define MCA_COLL_ACOLL_SPLIT_FACTOR_LIST_LEN 6
91+
#define MCA_COLL_ACOLL_SPLIT_FACTOR_LIST {2, 4, 8, 16, 32, 64}
8392

8493
typedef enum MCA_COLL_ACOLL_SG_SIZES {
8594
MCA_COLL_ACOLL_SG_SIZE_1 = 8,
@@ -142,6 +151,18 @@ typedef struct coll_acoll_data {
142151
int sync[2];
143152
} coll_acoll_data_t;
144153

154+
/* The enum literals are used as indices into arrays and values are
155+
* assigned to the enum literals so as to ensure it is valid irrespective
156+
* of what the compiler assigns. */
157+
typedef enum MCA_COLL_ACOLL_R2R_DIST {
158+
DIST_CORE = 0,
159+
DIST_L3CACHE,
160+
DIST_NUMA,
161+
DIST_SOCKET,
162+
DIST_NODE,
163+
DIST_END
164+
} MCA_COLL_ACOLL_R2R_DIST_T;
165+
145166
typedef struct coll_acoll_subcomms {
146167
ompi_communicator_t *local_comm;
147168
ompi_communicator_t *local_r_comm;
@@ -152,6 +173,7 @@ typedef struct coll_acoll_subcomms {
152173
ompi_communicator_t *orig_comm;
153174
ompi_communicator_t *socket_comm;
154175
ompi_communicator_t *socket_ldr_comm;
176+
ompi_communicator_t *split_comm[MCA_COLL_ACOLL_SPLIT_FACTOR_LIST_LEN]; // AllToAll odd even split comm
155177
int num_nodes;
156178
int derived_node_size;
157179
int is_root_node;
@@ -170,6 +192,7 @@ typedef struct coll_acoll_subcomms {
170192
int initialized;
171193
int prev_init_root;
172194
int num_root_change;
195+
MCA_COLL_ACOLL_R2R_DIST_T r2r_dist;
173196

174197
ompi_communicator_t *numa_comm_ldrs;
175198
ompi_communicator_t *node_comm;
@@ -193,6 +216,12 @@ typedef struct coll_acoll_reserve_mem {
193216
bool reserve_mem_in_use;
194217
} coll_acoll_reserve_mem_t;
195218

219+
typedef struct {
220+
int split_factor;
221+
size_t psplit_msg_thresh;
222+
size_t xpmem_msg_thresh;
223+
} coll_acoll_alltoall_attr_t;
224+
196225
struct mca_coll_acoll_module_t {
197226
mca_coll_base_module_t super;
198227
MCA_COLL_ACOLL_SG_SIZES sg_size;
@@ -218,6 +247,7 @@ struct mca_coll_acoll_module_t {
218247
coll_acoll_subcomms_t **subc;
219248
coll_acoll_reserve_mem_t reserve_mem_s;
220249
int num_subc;
250+
coll_acoll_alltoall_attr_t alltoall_attr;
221251
};
222252

223253
#ifdef HAVE_XPMEM_H

0 commit comments

Comments
 (0)