Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich -I/usr/include/mpich-x86_64/
HIPLDFLAGS += -L/usr/lib -L/usr/lib64/mpich/lib/ -lmpich
endif

LIBRARIES += rccl
Expand Down
12 changes: 4 additions & 8 deletions src/alltoallv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "cuda_runtime.h"
#include "common.h"

#define USE_RCCL_GATHER_SCATTER
//#define USE_RCCL_GATHER_SCATTER

void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
if (count < nranks*nranks/2) {
Expand Down Expand Up @@ -54,12 +54,12 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
size_t chunksize = data_count/nranks;
for (int j=0; j<nranks; j++) {
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
if (j+rank == nranks-1)
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
size_t sdisp = 0;
for (int k=0; k<nranks; k++) {
scount = ((k+j)%nranks)*chunksize;
if ((k+j)%nranks == 0)
if (k+j == nranks-1)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (k == rank)
break;
Expand Down Expand Up @@ -105,7 +105,7 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
size_t chunksize = count*2/nranks;
for (int i = 0; i < nranks; i++) {
size_t scount = ((i+rank)%nranks)*chunksize;
if ((i+rank)%nranks == 0)
if (i+rank == nranks-1)
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
Expand All @@ -122,24 +122,20 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
#else
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nranks; r++) {
if (sendcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
sendcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
if (recvcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
recvcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
Expand Down