Skip to content

Commit

Permalink
Symmetric alloc of buffers involved in PP haloex and signalling objs
Browse files Browse the repository at this point in the history
- issue #5103 , #4748 
- adds support for symmetric alloc of `forces` & `coordinates` buffers used in PP haloexchange.
- symmetric alloc of signaling objects which shall be used for NVSHMEM enabled PP haloexchange.
- Adds foundational support required for device initiated communication enabled PP haloexchange via NVSHMEM.
- Works with both RF and PME inputs.
  • Loading branch information
mdoijade authored and mabraham committed Oct 30, 2024
1 parent 6a110ef commit 947c2ab
Show file tree
Hide file tree
Showing 24 changed files with 638 additions and 37 deletions.
7 changes: 7 additions & 0 deletions admin/ci-scripts/gromacs-base-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ if grep -qF 'NVIDIA' <<< "$GPU_VENDOR"; then
echo " with PME decomposition"
fi
fi
if [[ "$GMX_ENABLE_NVSHMEM" != "" ]] && [[ "$GPU_COUNT" -eq "2" ]]
then
# In CI with dual GPUs NVSHMEM cannot support more than 2 MPI processes/GPU
# which happens in multi sim tests so we disable them
echo "Disabling MdrunMultiSim tests as with GMX_ENABLE_NVSHMEM it does not work on dual GPU setup without MPS"
EXTRA_FLAGS="--exclude-regex MdrunMultiSim "
fi
# Speed up device re-initialization, especially when running multiple tests in parallel
export CUDA_DEVICE_MAX_CONNECTIONS=2 # default is 8
fi
Expand Down
20 changes: 19 additions & 1 deletion src/gromacs/domdec/domdec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/gpu_utils/nvshmem_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
Expand Down Expand Up @@ -3069,7 +3070,7 @@ void constructGpuHaloExchange(const t_commrec& cr,
for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
{
cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
cr.dd, d, cr.mpi_comm_mygroup, deviceStreamManager.context(), pulse, wcycle));
cr.dd, d, cr.mpi_comm_mygroup, cr.mpi_comm_mysim, deviceStreamManager.context(), pulse, cr.useNvshmem, wcycle));
}
}
}
Expand All @@ -3078,11 +3079,27 @@ void reinitGpuHaloExchange(const t_commrec& cr,
const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
const DeviceBuffer<gmx::RVec> d_forcesBuffer)
{
int numDimsAndPulses = 0;
for (int d = 0; d < cr.dd->ndim; d++)
{
for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
{
cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
cr.dd->gpuHaloExchange[d][pulse]->reinitNvshmemSignal(cr, numDimsAndPulses++);
}
}
}

void destroyGpuHaloExchangeNvshmemBuf(const t_commrec& cr)
{
if (cr.nvshmemHandlePtr != nullptr)
{
for (int d = 0; d < cr.dd->ndim; d++)
{
for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
{
cr.dd->gpuHaloExchange[d][pulse]->destroyGpuHaloExchangeNvshmemBuf();
}
}
}
}
Expand All @@ -3092,6 +3109,7 @@ GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec& cr,
GpuEventSynchronizer* dependencyEvent)
{
GpuEventSynchronizer* eventPtr = dependencyEvent;

for (int d = 0; d < cr.dd->ndim; d++)
{
for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
Expand Down
4 changes: 4 additions & 0 deletions src/gromacs/domdec/domdec.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ void reinitGpuHaloExchange(const t_commrec& cr,
DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
DeviceBuffer<gmx::RVec> d_forcesBuffer);

/*! \brief Destructor for symmetric d_recvBuf used by NVSHMEM.
* \param [in] cr The commrec object
*/
void destroyGpuHaloExchangeNvshmemBuf(const t_commrec& cr);

/*! \brief GPU halo exchange of coordinates buffer.
* \param [in] cr The commrec object
Expand Down
15 changes: 15 additions & 0 deletions src/gromacs/domdec/gpuhaloexchange.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@

struct gmx_domdec_t;
struct gmx_wallcycle;
struct t_commrec;
class DeviceContext;
class DeviceStream;
class GpuEventSynchronizer;
Expand Down Expand Up @@ -86,15 +87,19 @@ class GpuHaloExchange
* \param [inout] dd domdec structure
* \param [in] dimIndex the dimension index for this instance
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] mpi_comm_mysim_world communicator used for simulation with PP + PME.
* \param [in] deviceContext GPU device context
* \param [in] pulse the communication pulse for this instance
* \param [in] useNvshmem use NVSHMEM for communication
* \param [in] wcycle The wallclock counter
*/
GpuHaloExchange(gmx_domdec_t* dd,
int dimIndex,
MPI_Comm mpi_comm_mysim,
MPI_Comm mpi_comm_mysim_world,
const DeviceContext& deviceContext,
int pulse,
bool useNvshmem,
gmx_wallcycle* wcycle);
~GpuHaloExchange();
GpuHaloExchange(GpuHaloExchange&& source) noexcept;
Expand All @@ -108,6 +113,12 @@ class GpuHaloExchange
*/
void reinitHalo(DeviceBuffer<RVec> d_coordinateBuffer, DeviceBuffer<RVec> d_forcesBuffer);

/*! \brief
* (Re-) Initialization for NVSHMEM Signal objects
* \param [in] cr Communication structure ref.
* \param [in] signalObjOffset offset of the signal object corresponding to given pulse/dim.
*/
void reinitNvshmemSignal(const t_commrec& cr, int signalObjOffset);

/*! \brief GPU halo exchange of coordinates buffer.
*
Expand All @@ -132,6 +143,10 @@ class GpuHaloExchange
*/
GpuEventSynchronizer* getForcesReadyOnDeviceEvent();

/*! \brief Destructor for symmetric d_recvBuf used by NVSHMEM.
*/
void destroyGpuHaloExchangeNvshmemBuf();

private:
class Impl;
std::unique_ptr<Impl> impl_;
Expand Down
14 changes: 14 additions & 0 deletions src/gromacs/domdec/gpuhaloexchange_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,10 @@ class GpuHaloExchange::Impl
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
int /* dimIndex */,
MPI_Comm /* mpi_comm_mysim */,
MPI_Comm /* mpi_comm_mysim_world */,
const DeviceContext& /* deviceContext */,
int /*pulse */,
bool /*useNvshmem*/,
gmx_wallcycle* /*wcycle*/) :
impl_(nullptr)
{
Expand All @@ -105,6 +107,12 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
}

void GpuHaloExchange::reinitNvshmemSignal(const t_commrec& /* cr */, int /* signalObjOffset */)
{
GMX_ASSERT(!impl_,
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
}

/*!\brief apply X halo exchange stub. */
GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
GpuEventSynchronizer* /*dependencyEvent*/)
Expand All @@ -131,6 +139,12 @@ GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
return nullptr;
}

void GpuHaloExchange::destroyGpuHaloExchangeNvshmemBuf()
{
GMX_ASSERT(!impl_,
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
}

} // namespace gmx

#endif // !GMX_GPU_CUDA && !GMX_GPU_SYCL
126 changes: 121 additions & 5 deletions src/gromacs/domdec/gpuhaloexchange_impl_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@

#include "config.h"

#if GMX_NVSHMEM
# include <nvshmem.h>
# include <nvshmemx.h>
#endif

#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/gpu_utils/device_context.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/gpu_utils/nvshmem_utils.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"
Expand Down Expand Up @@ -112,11 +118,59 @@ void GpuHaloExchange::Impl::reinitHalo(DeviceBuffer<Float3> d_coordinatesBuffer,
if (newSize > maxPackedBufferSize_)
{
reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);

if (useNvshmem_ && (maxPackedBufferSize_ > 0) && (newSize > sendBufSizeAlloc_))
{
GMX_RELEASE_ASSERT(d_sendBuf_ != nullptr,
"Halo exchange requires d_sendBuf_ buffer to be valid");
#if GMX_NVSHMEM
// unregister only when previous d_sendBuf_ was registered previously
// via nvshmemx_buffer_register
GMX_RELEASE_ASSERT(nvshmemx_buffer_unregister(d_sendBuf_) == 0,
"NVSHMEM d_sendBuf_ Buffer unregistration failed");
#endif
}
reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
maxPackedBufferSize_ = newSize;

if (useNvshmem_ && (maxPackedBufferSize_ < sendBufSizeAlloc_))
{
// As d_sendBuf_ is a source buffer in the PP Halo exchange nvshmem_put
// we do not need to do a symmetric allocation for it, registering it via
// nvshmemx_buffer_register is sufficient.
#if GMX_NVSHMEM
std::size_t bufLen = sendBufSizeAlloc_ * sizeof(float3);
GMX_RELEASE_ASSERT(nvshmemx_buffer_register(d_sendBuf_, bufLen) == 0,
"NVSHMEM d_sendBuf_ Buffer registration failed");
#endif
}
// number of values/elems is same for indexMapSizeAlloc_/sendBufSizeAlloc_ so we can use either.
maxPackedBufferSize_ = sendBufSizeAlloc_;
}

int recvBufNewSize = newSize;
if (useNvshmem_)
{
MPI_Allreduce(&newSize, &recvBufNewSize, 1, MPI_INT, MPI_MAX, mpi_comm_mysim_world_);
#if GMX_MPI
// remote PE atomOffset to nvshmem put halo coordinates
MPI_Sendrecv(&atomOffset_,
sizeof(int),
MPI_BYTE,
recvRankX_,
0,
&nvshmemHaloExchange_.putAtomOffsetInReceiverRankXBuf_,
sizeof(int),
MPI_BYTE,
sendRankX_,
0,
mpi_comm_mysim_,
MPI_STATUS_IGNORE);
#endif
}

reallocateDeviceBuffer(
&d_recvBuf_, recvBufNewSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_, useNvshmem_);

xSendSize_ = newSize;
#if GMX_MPI
MPI_Sendrecv(&xSendSize_,
Expand Down Expand Up @@ -200,6 +254,26 @@ void GpuHaloExchange::Impl::reinitHalo(DeviceBuffer<Float3> d_coordinatesBuffer,
wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
}

void GpuHaloExchange::Impl::reinitNvshmemSignal(const t_commrec& cr, int signalObjOffset)
{
if (useNvshmem_)
{
GMX_RELEASE_ASSERT(cr.nvshmemHandlePtr->d_ppHaloExSyncBase_ != nullptr,
"NVSHMEM Coordinate Halo exchange requires valid signal buffer");
nvshmemHaloExchange_.signalObjOffset_ = signalObjOffset;
nvshmemHaloExchange_.d_signalSenderRankX_ = cr.nvshmemHandlePtr->d_ppHaloExSyncBase_;
// As only CUDA DeviceBuffer<> supports pointer updates from host side
// we guard these pointer update code by GMX_GPU_CUDA
#if GMX_GPU_CUDA
int totalPulsesAndDims = cr.nvshmemHandlePtr->ppHaloExPerSyncBufSize_;
nvshmemHaloExchange_.d_signalReceiverRankX_ =
cr.nvshmemHandlePtr->d_ppHaloExSyncBase_ + totalPulsesAndDims;
nvshmemHaloExchange_.d_signalReceiverRankF_ =
cr.nvshmemHandlePtr->d_ppHaloExSyncBase_ + 2 * totalPulsesAndDims;
#endif
}
}

void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
#if GMX_MPI
Expand Down Expand Up @@ -557,8 +631,10 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
int dimIndex,
MPI_Comm mpi_comm_mysim,
MPI_Comm mpi_comm_mysim_world,
const DeviceContext& deviceContext,
int pulse,
bool useNvshmem,
gmx_wallcycle* wcycle) :
dd_(dd),
sendRankX_(dd->neighbor[dimIndex][1]),
Expand All @@ -569,11 +645,13 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
haloXDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
haloFDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
mpi_comm_mysim_(mpi_comm_mysim),
mpi_comm_mysim_world_(mpi_comm_mysim_world),
deviceContext_(deviceContext),
haloStream_(new DeviceStream(deviceContext, DeviceStreamPriority::High, false)),
dimIndex_(dimIndex),
pulse_(pulse),
wcycle_(wcycle)
wcycle_(wcycle),
useNvshmem_(useNvshmem)
{
if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
{
Expand All @@ -587,19 +665,47 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,

GpuHaloExchange::Impl::~Impl()
{
if (useNvshmem_)
{
#if GMX_NVSHMEM
// As d_sendBuf_ is a source buffer in the PP Halo exchange nvshmem_put
// we had registered it via nvshmemx_buffer_register, such registered buffer
// need to be first unregistered via nvshmemx_buffer_unregister before freeing.
if (d_sendBuf_)
{
GMX_RELEASE_ASSERT(nvshmemx_buffer_unregister(d_sendBuf_) == 0,
"NVSHMEM d_sendBuf_ Buffer unregistration failed");
}
#endif
}
else
{
// For the NVSHMEM path the freeing of d_recvBuf_
// happens in destroyGpuHaloExchangeNvshmemBuf() due to it
// been a collective call calling it at this point is not appropriate
freeDeviceBuffer(&d_recvBuf_);
}

freeDeviceBuffer(&d_indexMap_);
freeDeviceBuffer(&d_sendBuf_);
freeDeviceBuffer(&d_recvBuf_);
freeDeviceBuffer(&d_fShift_);
}

void GpuHaloExchange::Impl::destroyGpuHaloExchangeNvshmemBuf()
{
// freeing the NVSHMEM symmetric buffer
freeDeviceBuffer(&d_recvBuf_);
}

GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
int dimIndex,
MPI_Comm mpi_comm_mysim,
MPI_Comm mpi_comm_mysim_world_,
const DeviceContext& deviceContext,
int pulse,
bool useNvshmem,
gmx_wallcycle* wcycle) :
impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, pulse, wcycle))
impl_(new Impl(dd, dimIndex, mpi_comm_mysim, mpi_comm_mysim_world_, deviceContext, pulse, useNvshmem, wcycle))
{
}

Expand All @@ -618,6 +724,16 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceB
impl_->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
}

void GpuHaloExchange::reinitNvshmemSignal(const t_commrec& cr, int signalObjOffset)
{
impl_->reinitNvshmemSignal(cr, signalObjOffset);
}

void GpuHaloExchange::destroyGpuHaloExchangeNvshmemBuf()
{
impl_->destroyGpuHaloExchangeNvshmemBuf();
}

GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix box,
GpuEventSynchronizer* dependencyEvent)
{
Expand Down
Loading

0 comments on commit 947c2ab

Please sign in to comment.