Refactor cudagraph-aware AllToAll: move cudagraph-aware logic out of the GPE submit path

Regina8023 · meta-codesync[bot] · commit 06b1862323d5 · 2025-10-25T02:08:27.000-07:00
Summary: Refactored cudagraph-aware alltoall (D77554973) to follow the same structure as cudagraph-aware alltoallvDynamic (D78133900): the cudagraph-aware logic now lives in a standalone prepare function that is passed to GPE submit. This simplifies the GPE submit function and removes the Ctran.h dependency from ctranGPE.

Reviewed By: minsii

Differential Revision: D83850249

fbshipit-source-id: 5516c0a1d5166fc65f12c8f349f69786b68b65fe
diff --git a/comms/ctran/algos/AllToAll/AllToAll.cc b/comms/ctran/algos/AllToAll/AllToAll.cc
@@ -6,6 +6,7 @@
 
 #include "comms/ctran/CtranComm.h"
 #include "comms/ctran/algos/AllToAll/AllToAllImpl.h"
+#include "comms/ctran/algos/AllToAll/AllToAllPImpl.h"
 #include "comms/ctran/algos/AllToAll/AllToAllvImpl.h"
 #include "comms/ctran/algos/CtranAlgo.h"
 #include "comms/ctran/gpe/CtranGpe.h"
@@ -125,12 +126,17 @@ commResult_t ctranAllToAll(
   std::vector<std::unique_ptr<struct OpElem>> opGroup;
   FB_COMMCHECK(setupGpeOp(
       sendbuff, recvbuff, count, datatype, comm, stream, opCount, opGroup));
-
+  ctran::PreLaunchGraphPrepareFn graphPrepareFn = nullptr;
+  if (NCCL_CTRAN_ALLTOALL_CUDAGRAPH_AWARE_ENABLE) {
+    graphPrepareFn = ctran::alltoallp::prepareCudagraphAwareAllToAll;
+  }
   FB_COMMCHECK(comm->ctran_->gpe->submit(
       std::move(opGroup),
       opIbImpl,
       config,
-      reinterpret_cast<void*>(ctran::alltoall::alltoallKerns[datatype])));
+      reinterpret_cast<void*>(ctran::alltoall::alltoallKerns[datatype]),
+      std::nullopt, /* timeout */
+      graphPrepareFn));
 
   return commSuccess;
 }
diff --git a/comms/ctran/algos/AllToAll/AllToAllP.cc b/comms/ctran/algos/AllToAll/AllToAllP.cc
@@ -30,9 +30,6 @@ commResult_t AllToAllPInit(
   const auto nRanks = statex->nRanks();
 
   SetCudaDevRAII setCudaDev(statex->cudaDev());
-  size_t size = maxRecvCount * commTypeSize(datatype);
-  void* regHdl{nullptr};
-  bool localReg = false;
   AlgoImpl* algo = new AlgoImpl(comm, stream);
   if (!algo) {
     return commSystemError;
@@ -43,29 +40,10 @@ commResult_t AllToAllPInit(
       delete algo;
     }
   });
-  // TODO: Pass-in a flag searchOnly to avoid dynamic register instead of reg
-  // then deregister.
-  FB_COMMCHECK(comm->ctran_->mapper->searchRegHandle(
-      recvbuff, size, &regHdl, &localReg));
-  if (localReg) {
-    comm->ctran_->mapper->deregDynamic(regHdl);
-    CLOGF(
-        ERR,
-        "recvbuff is not registered. Pointer: {} length: {}",
-        recvbuff,
-        size);
-    return commInternalError;
-  }
-
   std::string skip_ctrl_msg;
   hints.get("ncclx_alltoallp_skip_ctrl_msg_exchange", skip_ctrl_msg);
-  algo->pArgs = {
-      .recvbuff = recvbuff,
-      .recvHdl = regHdl,
-      .maxRecvCount = maxRecvCount,
-      .datatype = datatype,
-      .skipCtrlMsg = (skip_ctrl_msg == "true"),
-  };
+  FB_COMMCHECK(algo->setPArgs(
+      recvbuff, maxRecvCount, skip_ctrl_msg == "true", datatype));
   FB_COMMCHECK(algo->init());
   request = new CtranPersistentRequest(
       CtranPersistentRequest::Type::ALLTOALL_P, comm, stream);
@@ -77,11 +55,10 @@ commResult_t AllToAllPInit(
   CLOGF_SUBSYS(
       INFO,
       COLL,
-      "AllToAllPInit: rank {} initialized request {}, recvbuff {} recvHdl {}, comm {} commHash {:x} commDesc {} [nranks={}, localRanks={}] stream={}",
+      "AllToAllPInit: rank {} initialized request {}, recvbuff {}, comm {} commHash {:x} commDesc {} [nranks={}, localRanks={}] stream={}",
       statex->rank(),
       (void*)request,
       (void*)recvbuff,
-      (void*)regHdl,
       (void*)comm,
       statex->commHash(),
       statex->commDesc(),
diff --git a/comms/ctran/algos/AllToAll/AllToAllPImpl.cc b/comms/ctran/algos/AllToAll/AllToAllPImpl.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Meta Platforms, Inc. and affiliates.
 
 #include "comms/ctran/algos/AllToAll/AllToAllPImpl.h"
+#include "Types.h"
 #include "comms/ctran/CtranComm.h"
 #include "comms/ctran/algos/AllToAll/AllToAllImpl.h"
 #include "comms/ctran/algos/AllToAll/AllToAllvImpl.h"
@@ -373,4 +374,48 @@ commResult_t AlgoImpl::exec(const void* sendbuff, const size_t count) {
       reinterpret_cast<void*>(ctran::alltoall::alltoallKerns[datatype])));
   return commSuccess;
 }
+
+commResult_t AlgoImpl::updatePersistentFuncAndOp(
+    opFunc& opFunc,
+    struct OpElem* op) {
+  opFunc = gpeFn; // overwrite the caller's op function with gpeFn
+  op->type = OpElem::opType::ALLTOALLP; // retag the existing op in place
+  op->alltoallP.sendbuff = op->alltoall.sendbuff; // NOTE(review): if alltoall/alltoallP share storage, these copies depend on field layout and order - confirm
+  op->alltoallP.count = op->alltoall.count;
+  op->alltoallP.pArgs = &pArgs; // non-owning pointer to this object's pArgs member
+  CLOGF_TRACE(
+      COLL,
+      "AllToAllP: rank {} updated op to {} and gpeFn to persistent version.",
+      comm_->statex_->rank(),
+      (void*)op);
+  return commSuccess;
+}
+
+commResult_t prepareCudagraphAwareAllToAll(
+    opFunc& opFunc,
+    struct OpElem* op,
+    PersistentObj& pObj) {
+  pObj = std::make_unique<AlgoImpl>(op->comm_, op->stream); // pObj now owns it
+  auto algoImplPtr = std::get<std::unique_ptr<AlgoImpl>>(pObj).get();
+  if (!algoImplPtr) {
+    return commSystemError; // defensive only: std::make_unique throws instead of returning null
+  }
+
+  FB_COMMCHECK(algoImplPtr->setPArgs(
+      op->alltoall.recvbuff,
+      op->alltoall.count * op->comm_->statex_->nRanks(), // passed as maxRecvCount
+      true /* skipCtrlMsg */,
+      op->alltoall.datatype));
+
+  // Exchange memory handles and record them in pArgs. This step is not
+  // captured by the cudagraph.
+  FB_COMMCHECK(algoImplPtr->init());
+
+  // Replace the GPE function with the persistent version (which skips the
+  // memory-handle exchange), and convert op in place into the persistent
+  // op, which points at the pArgs holding the recorded remote handles.
+
+  FB_COMMCHECK(algoImplPtr->updatePersistentFuncAndOp(opFunc, op));
+  return commSuccess;
+}
 } // namespace ctran::alltoallp
diff --git a/comms/ctran/algos/AllToAll/AllToAllPImpl.h b/comms/ctran/algos/AllToAll/AllToAllPImpl.h
@@ -3,21 +3,16 @@
 #pragma once
 
 #include <folly/synchronization/CallOnce.h>
+#include "Types.h"
 #include "comms/ctran/CtranComm.h"
+#include "comms/ctran/gpe/CtranGpe.h"
+#include "comms/ctran/hints/Hints.h"
+#include "comms/ctran/mapper/CtranMapper.h"
 #include "comms/ctran/mapper/CtranMapperTypes.h"
+#include "comms/ctran/utils/ExtUtils.h"
 #include "comms/utils/cvars/nccl_cvars.h"
 
 namespace ctran::alltoallp {
-struct PersistArgs {
-  void* recvbuff;
-  void* recvHdl;
-  size_t maxRecvCount;
-  commDataType_t datatype;
-  bool skipCtrlMsg;
-  std::vector<void*> remoteRecvBuffs;
-  std::vector<struct CtranMapperRemoteAccessKey> remoteAccessKeys;
-};
-
 class AlgoImpl {
  public:
   PersistArgs pArgs;
@@ -30,6 +25,40 @@ class AlgoImpl {
 
   commResult_t exec(const void* sendbuff, const size_t count);
 
+  inline commResult_t setPArgs(
+      void* recvbuff,
+      const size_t maxRecvCount,
+      bool skipCtrlMsg,
+      commDataType_t datatype) {
+    size_t size = maxRecvCount * commTypeSize(datatype);
+    void* regHdl{nullptr};
+    bool localReg = false;
+    // TODO: Pass in a searchOnly flag so that an unregistered buffer is not
+    // dynamically registered here only to be deregistered right below.
+    FB_COMMCHECK(comm_->ctran_->mapper->searchRegHandle(
+        recvbuff, size, &regHdl, &localReg));
+    if (localReg) { // treated as "caller did not register recvbuff": undo and fail
+      comm_->ctran_->mapper->deregDynamic(regHdl);
+      CLOGF(
+          ERR,
+          "recvbuff is not registered. Pointer: {} length: {}",
+          recvbuff,
+          size);
+      return commInternalError;
+    }
+
+    pArgs = { // replaces pArgs wholesale; the omitted vector members become empty
+        .recvbuff = recvbuff,
+        .recvHdl = regHdl,
+        .maxRecvCount = maxRecvCount,
+        .datatype = datatype,
+        .skipCtrlMsg = skipCtrlMsg,
+    };
+    return commSuccess;
+  }
+
+  commResult_t updatePersistentFuncAndOp(opFunc& opFunc, struct OpElem* op);
+
   static inline const std::string algoName(enum NCCL_ALLTOALL_ALGO algo) {
     switch (algo) {
       case NCCL_ALLTOALL_ALGO::ctran:
@@ -43,4 +72,9 @@ class AlgoImpl {
   CtranComm* comm_{nullptr};
   cudaStream_t stream_{nullptr};
 };
+
+commResult_t prepareCudagraphAwareAllToAll(
+    opFunc& opFunc,
+    struct OpElem* op,
+    PersistentObj& pObj);
 } // namespace ctran::alltoallp
diff --git a/comms/ctran/algos/AllToAll/Types.h b/comms/ctran/algos/AllToAll/Types.h
@@ -0,0 +1,21 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+#pragma once
+#include <vector>
+
+#include "comms/ctran/mapper/CtranMapperTypes.h"
+#include "comms/utils/commSpecs.h"
+
+namespace ctran::alltoallp {
+struct PersistArgs {
+  void* recvbuff; // receive buffer given to AlgoImpl::setPArgs
+  void* recvHdl; // handle for recvbuff obtained from the mapper's searchRegHandle
+  size_t maxRecvCount; // element count; times commTypeSize(datatype) gives the size looked up
+  commDataType_t datatype; // element type of recvbuff
+  bool skipCtrlMsg; // AllToAllPInit derives this from the "ncclx_alltoallp_skip_ctrl_msg_exchange" hint
+  std::vector<void*> remoteRecvBuffs; // not set by setPArgs; presumably filled during init() - confirm
+  std::vector<struct CtranMapperRemoteAccessKey> remoteAccessKeys; // not set by setPArgs; presumably filled during init() - confirm
+};
+
+class AlgoImpl;
+} // namespace ctran::alltoallp
diff --git a/comms/ctran/gpe/CtranGpe.cc b/comms/ctran/gpe/CtranGpe.cc
@@ -373,14 +373,16 @@ commResult_t CtranGpe::submit(
     opFunc func,
     KernelConfig& kernelConfig,
     const void* ncclKernel,
-    std::optional<std::chrono::milliseconds> timeout) {
+    std::optional<std::chrono::milliseconds> timeout,
+    PreLaunchGraphPrepareFn graphPrepareFn) {
   return this->pimpl->submit(
       CtranGpeCmd::TypeEnum::GRAPH_ENQUEUE,
       std::move(opGroup),
       func,
       kernelConfig,
       ncclKernel,
-      timeout);
+      timeout,
+      graphPrepareFn);
 }
 
 commResult_t CtranGpe::submitHost(
diff --git a/comms/ctran/gpe/CtranGpe.h b/comms/ctran/gpe/CtranGpe.h
@@ -12,6 +12,7 @@
 
 #include "comms/ctran/CtranComm.h"
 #include "comms/ctran/CtranExImpl.h"
+#include "comms/ctran/algos/AllToAll/Types.h"
 #include "comms/ctran/algos/CtranAlgoDev.h"
 #include "comms/ctran/algos/common/GpeKernelSync.h"
 #include "comms/ctran/gpe/CtranGpeDev.h"
@@ -20,6 +21,13 @@
 typedef commResult_t (*opFunc)(
     const std::vector<std::unique_ptr<struct OpElem>>& opGroup);
 
+namespace ctran {
+using PersistentObj =
+    std::variant<std::monostate, std::unique_ptr<ctran::alltoallp::AlgoImpl>>;
+using PreLaunchGraphPrepareFn =
+    commResult_t (*)(opFunc& opFunc, struct OpElem* op, PersistentObj& pObj);
+} // namespace ctran
+
 struct OpElem {
   enum opType {
     ALLGATHER,
@@ -369,7 +377,8 @@ class CtranGpe {
       opFunc func,
       KernelConfig& kernelConfig,
       const void* ncclKernel,
-      std::optional<std::chrono::milliseconds> timeout = std::nullopt);
+      std::optional<std::chrono::milliseconds> timeout = std::nullopt,
+      ctran::PreLaunchGraphPrepareFn graphPrepareFn = nullptr);
 
   // Submit host mem communication. No kernel is launched, and only the host
   // side func will be submitted to the GPE thread. Also the op won't be
diff --git a/comms/ctran/gpe/CtranGpeImpl.cc b/comms/ctran/gpe/CtranGpeImpl.cc
@@ -6,7 +6,7 @@
 
 #include <folly/dynamic.h>
 
-#include "comms/ctran/Ctran.h"
+#include "comms/ctran/algos/AllToAll/AllToAllPImpl.h"
 #include "comms/ctran/algos/AllToAll/AllToAllvDynamicPImpl.h"
 #include "comms/ctran/algos/common/GpeKernel.h"
 #include "comms/ctran/gpe/CtranChecksum.h"
@@ -17,7 +17,6 @@
 #include "comms/ctran/tracing/CollTraceWrapper.h"
 #include "comms/ctran/tracing/MapperTrace.h"
 #include "comms/ctran/utils/Checks.h"
-#include "comms/ctran/utils/CudaGraphUtils.h"
 #include "comms/ctran/utils/CudaWrap.h"
 #include "comms/ctran/utils/Debug.h"
 #include "comms/ctran/utils/Exception.h"
@@ -150,7 +149,8 @@ commResult_t CtranGpe::Impl::submit(
     opFunc func,
     KernelConfig& kernelConfig,
     const void* ncclKernel,
-    std::optional<std::chrono::milliseconds> timeout) {
+    std::optional<std::chrono::milliseconds> timeout,
+    PreLaunchGraphPrepareFn graphPrepareFn) {
   commResult_t res = commSuccess;
 
   // Reclaim once to gain back available flags
@@ -250,49 +250,13 @@ commResult_t CtranGpe::Impl::submit(
       }
       cmd->coll.comm = comm;
     }
-
     if (streamCaptureInfo.status == cudaStreamCaptureStatusActive) {
+      FB_COMMCHECK(preLaunchGraphPrepare(cmd, graphPrepareFn));
       struct cmdCbPlan* plan = new struct cmdCbPlan;
       // cudagraph-aware alltoall: transfer alltoall to alltoallPersistent for
       // perf optimization
       auto op = cmd->coll.opGroup.front().get();
       if (NCCL_CTRAN_ALLTOALL_CUDAGRAPH_AWARE_ENABLE &&
-          op->type == OpElem::opType::ALLTOALL) {
-        CtranPersistentRequest* pReq;
-        // FIXME: update alltoall API to allow passing hints to skip/not skip
-        // ctrl msg exchange.
-        meta::comms::Hints hints;
-        hints.set("ncclx_alltoallp_skip_ctrl_msg_exchange", "true");
-        // The init will submit a GPE op exchangeMemHandle that not captured by
-        // cudagraph.
-        // FIXME: for cudagraph, the sendbuff is also persistent, should record
-        // its handle in pReq and skip searchRegHandle in exec.
-        // FIXME: the gpe thread should call algo impl instead of user API to
-        // allow more flexibility in cudagraph mode.
-        ctran::AllToAllPInit(
-            op->alltoall.recvbuff,
-            op->alltoall.count * op->comm_->statex_->nRanks(),
-            hints,
-            op->alltoall.datatype,
-            op->comm_,
-            op->stream,
-            pReq);
-
-        // Capture alltoallp op instead of alltoall because alltoall under
-        // cudagraph is essentially alltoallp. A new alltoallp op will be
-        // submitted inside AllToAllPExec so we can return once it's done.
-        // Release kernel args grabbed earlier
-        if (kernelFlag != nullptr) {
-          kernelFlag->reset();
-        }
-        // Add callback for alltoallp cmd instead.
-        // FIXME: the gpe thread should call algo impl instead of user API to
-        // allow more flexibility in cudagraph mode.
-        FB_COMMCHECK(ctran::AllToAllPExec(
-            op->alltoall.sendbuff, op->alltoall.count, pReq));
-        return commSuccess;
-      } else if (
-          NCCL_CTRAN_ALLTOALL_CUDAGRAPH_AWARE_ENABLE &&
           op->type == OpElem::opType::ALLTOALLV_DYNAMIC_SPLIT_NON_CONTIG) {
         // FIXME: this should control by hints passed from user instead of CVAR
         // so we can have per-collective control
diff --git a/comms/ctran/gpe/CtranGpeImpl.h b/comms/ctran/gpe/CtranGpeImpl.h
diff --git a/comms/ctran/utils/CudaGraphUtils.h b/comms/ctran/utils/CudaGraphUtils.h