facebookresearch · TaekyungHeo · Mar 19, 2025 · Mar 21, 2025 · Mar 21, 2025 · Apr 16, 2025
@@ -1,2 +1,3 @@
 .venv/
+.vscode/
 __pycache__/
@@ -127,7 +127,6 @@ class BaseBackend(ABC):
     def __init__(self) -> None:
         self.tcp_store = None
         self.collectiveFunc = {
-            "all_to_all_single": self.all_to_all_single,  # pyre-ignore[16]:
             "all_to_all": self.all_to_all,
             "all_to_allv": self.all_to_allv,
             "all_reduce": self.all_reduce,

@@ -233,14 +233,15 @@ def all_to_all(
                 group=self.get_collective_group(collectiveArgs),
                 async_op=collectiveArgs.asyncOp,
             )
-
+            
         if collectiveArgs.asyncOp:
             collectiveArgs.waitObj.append(work)
 
         if retFlag:
             return work
 
     def all_to_allv(self, collectiveArgs, retFlag=False, pair=False):
+        # the cpp-layer all_to_allv corresponds to the python-layer all_to_all_single
         # pair=True mode does not support quantization
         if (
             collectiveArgs.all2all_qcomm
@@ -301,25 +302,6 @@ def all_to_allv(self, collectiveArgs, retFlag=False, pair=False):
         if retFlag:
             return work
 
-    def all_to_all_single(self, collectiveArgs, retFlag=False, pair=False):
-        # does not support quantization
-        if collectiveArgs.all2all_qcomm:
-            logger.warn("all_to_all_single does not support quantization")
-            return
-
-        work = dist.all_to_all_single(
-            collectiveArgs.opTensor if not pair else collectiveArgs.opTensor_pair,
-            collectiveArgs.ipTensor if not pair else collectiveArgs.ipTensor_pair,
-            group=collectiveArgs.group,
-            async_op=collectiveArgs.asyncOp,
-        )
-
-        if collectiveArgs.asyncOp:
-            collectiveArgs.waitObj.append(work)
-
-        if retFlag:
-            return work
-
     def all_gather(self, collectiveArgs, retFlag=False, pair=False):
         if self.use_ext_dist:
             retObj = collectiveArgs.group.all_gather(

@@ -1,6 +1,7 @@
 # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
 from __future__ import annotations
 
+import math
 import json
 
 import logging
@@ -202,7 +203,7 @@ def _parse_comms_op_node(  # noqa: C901
                 comm_args.worldSize = total_ranks
             comm_args.inSplit = json.loads(node.commArgs.in_split_size)
             comm_args.outSplit = json.loads(node.commArgs.out_split_size)
-
+     
         comms_op_list.append(comm_args)
 
     return comms_op_list

@@ -107,7 +107,6 @@ def fixBeginSize(commsParams: commsParamsHolder, world_size: int) -> None:
     if commsParams.collective in (
         "all_to_all",
         "all_to_allv",
-        "all_to_all_single",
         "all_gather",
         "all_gather_base",
         "gather",
@@ -293,14 +292,13 @@ def checkQuantArgs(
     if collective not in (
         "all_to_all",
         "all_to_allv",
-        "all_to_all_single",
         "reduce",
         "all_reduce",
     ):
         raise NotImplementedError(
             f"quantized communication for {collective} is currently unsupported."
         )
-    if collective in ("all_to_all", "all_to_allv", "all_to_all_single"):
+    if collective in ("all_to_all", "all_to_allv"):
         if (beginSize // 4) % quant_a2a_embedding_dim != 0:
             logger.warning(
                 f"begin size {beginSize} must be a multiple of --quant-a2a-embedding-dim {quant_a2a_embedding_dim} for all_to_all operation"
@@ -342,7 +340,6 @@ def paramToCommName(name: str, supported_comms: list[str] | None = None) -> str:
         "alltoall": "all_to_all",
         "alltoallv": "all_to_allv",
         "alltoallbase": "all_to_allv",
-        "alltoallsingle": "all_to_all_single",
         "allreduce": "all_reduce",
         "allgather": "all_gather",
         "allgatherbase": "all_gather_base",
@@ -873,56 +870,17 @@ def _prep_all_to_allv(
         opTensor = torch.Tensor()
         if allocate:
             # all_to_allv requires two tensors
+            # ipTensor is allocated outside of this function and simply passed through
             opTensor = self.backendFuncs.alloc_random(
                 [numElementsOut], curDevice, dtype, scaleFactor
             )
         # recorded splits in trace is only for dim 0, but tensor in replay has been flattened.
         # need to recalculate the splits for flattened 1D tensor
-        self.collectiveArgs.opTensor_split = (
-            [numElementsOut // sum(curComm.outSplit) * i for i in curComm.outSplit]
-            if curComm.outSplit
-            else None
-        )
-        self.collectiveArgs.ipTensor_split = (
-            [numElementsIn // sum(curComm.inSplit) * i for i in curComm.inSplit]
-            if curComm.inSplit
-            else None
-        )
-        return (ipTensor, opTensor)
-
-    def _prep_all_to_all_single(
-        self,
-        ipTensor: torch.Tensor,
-        curComm: commsArgs,
-        commsParams: commsParamsHolderBase,
-        numElementsIn: int,
-        numElementsOut: int,
-        world_size: int,
-        curDevice: str,
-        dtype: torch.dtype,
-        scaleFactor: float,
-        allocate: bool = True,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        ipTensor = torch.Tensor()
-        opTensor = torch.Tensor()
-        if allocate:
-            if commsParams.dcheck == 1:
-                ipTensor = self.backendFuncs.alloc_ones(
-                    [numElementsIn],
-                    curDevice,
-                    commsParams.dtype,
-                    self.initVal,
-                )
-            else:
-                ipTensor = self.backendFuncs.alloc_random(
-                    [numElementsIn],
-                    curDevice,
-                    commsParams.dtype,
-                    scaleFactor,
-                )
-            opTensor = self.backendFuncs.alloc_random(
-                [numElementsOut], curDevice, dtype, scaleFactor
-            )
+        # corner case: a rank may send zero data out while still receiving data from other ranks (or vice versa), so a split sum can be 0; clamp the divisor to at least 1.
+        self.collectiveArgs.opTensor_split = \
+            [numElementsOut // max(sum(curComm.outSplit), 1) * i for i in curComm.outSplit] if curComm.outSplit else None
+        self.collectiveArgs.ipTensor_split = \
+            [numElementsIn // max(sum(curComm.inSplit), 1) * i for i in curComm.inSplit] if curComm.inSplit else None
         return (ipTensor, opTensor)
 
     def _prep_all_to_all(
@@ -941,19 +899,11 @@ def _prep_all_to_all(
         ipTensor = []
         opTensor = []
         if allocate:
-            alloc_func = (
-                self.backendFuncs.alloc_ones
-                if commsParams.dcheck == 1
-                else self.backendFuncs.alloc_random
-            )
-            ipTensor = [
-                alloc_func(i, curDevice, commsParams.dtype, self.initVal)
-                for i in curComm.inSplit
-            ]
-            opTensor = [
-                alloc_func(i, curDevice, commsParams.dtype, self.initVal)
-                for i in curComm.outSplit
-            ]
+            i_alloc_func = self.backendFuncs.alloc_ones if commsParams.dcheck == 1 else self.backendFuncs.alloc_random
+            i_scale_factor = self.initVal if commsParams.dcheck == 1 else scaleFactor
+            ipTensor = [i_alloc_func([i], curDevice, commsParams.dtype, i_scale_factor) for i in curComm.inSplit]
+
+            opTensor = [self.backendFuncs.alloc_random([i], curDevice, commsParams.dtype, scaleFactor) for i in curComm.outSplit]
         return (ipTensor, opTensor)
 
     def _prep_all_gather(
@@ -1240,7 +1190,6 @@ def prepComm(
         # TODO: consider using this dictionary to check valid keywords rather than silently defaulting
 
         dispatchDict = {
-            "all_to_all_single": self._prep_all_to_all_single,
             "all_to_allv": self._prep_all_to_allv,
             "all_to_all": self._prep_all_to_all,
             "all_gather": self._prep_all_gather,