Commit 7898bfd
add debugability for baby pg (#213)
* support async in nccl pg

  Summary:
  - set the same stream as the one used for work in future continuations, so that random streams don't depend on the pg stream (otherwise those streams can become dependent on the allreduce stream)
  - wait on the work sent to pg's immediately on the fragment streams (used for allreduce), to make them depend on the pg stream and so that they don't depend on any future work submitted to those streams
  - copy grads before allreduce, so the inner optimization can use them and no dependency is created between the default stream and the pg stream (a sketch of this stream discipline follows this message)
  - add back support for quantized allreduce in the manager
  - change return types to be consistent with pg allreduce
  - the returned future from quantization collectives hangs (likely because set_result is not called?), so changed it to return the future directly from the pg

  Test Plan:
  - tested the changes with nccl pg
  - synchronizing on the recovery stream sometimes makes the cpu block on the collective (probably because some callback gets scheduled on the recovery stream? we need to stop synchronizing on the recovery stream when there is no need to)
  - calling `work.wait` on the work returned by the baby nccl pg makes the cpu block on the collective (because 2 contexts can't overlap?)
  - pg gloo needs us to call `future.wait` in the sync phase instead of the prepare phase, so we probably need a different wrapper
  - same for baby gloo pg

  > Without Quantization

  <img width="1188" alt="image" src="https://github.com/user-attachments/assets/8f8dd694-a972-4bc6-96a0-8a79627a4d5d" />

  > With Quantization

  <img width="1123" alt="image" src="https://github.com/user-attachments/assets/b54288a3-9727-4956-89e7-c8b8775a98aa" />

* add debugability for baby pg

  Summary:
  - running multiple processes has a few limitations:
    - we can't get gpu profiles from subprocesses
    - the results can differ because cuda uses a different context that can't run concurrently; this can make it hard to tell whether something is wrong with the code or is an artefact of the cuda context
  - use multiprocessing.dummy to use threads instead of processes

  Test Plan:
  Using the patch with baby nccl, we can get overlapping communication and computation:

  <img width="1539" alt="image" src="https://github.com/user-attachments/assets/39152858-1373-4318-8646-398141db3072" />

  We cannot get the overlap when using multiple processes, indicating the difference comes from the cuda context:

  <img width="1537" alt="image" src="https://github.com/user-attachments/assets/6b823d8e-a152-4678-a7e4-b6b8d6b6bb54" />
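A minimal sketch of the stream discipline described in the first summary, assuming a single `pg_stream` and treating the caller's stream as the fragment stream; `allreduce_grad` and its signature are illustrative, not the actual torchft API:

```
import torch
import torch.distributed as dist


def allreduce_grad(
    grad: torch.Tensor,
    pg: dist.ProcessGroup,
    pg_stream: torch.cuda.Stream,
) -> torch.futures.Future:
    main_stream = torch.cuda.current_stream()
    # Copy the grad first so the inner optimizer can keep using the
    # original without tying the default stream to the pg stream.
    grad_copy = grad.clone()
    with torch.cuda.stream(pg_stream):
        # The pg stream must see the finished copy before reducing it.
        pg_stream.wait_stream(main_stream)
        # Tell the caching allocator the copy is used on pg_stream.
        grad_copy.record_stream(pg_stream)
        work = pg.allreduce([grad_copy])
        fut = work.get_future()
    # Waiting on the work right away makes the caller's (fragment) stream
    # depend on the pg stream now, not on work submitted to it later.
    work.wait()

    def continuation(f: torch.futures.Future) -> torch.Tensor:
        # Run the continuation on the same stream used for the work, so
        # arbitrary callback streams don't depend on the allreduce stream.
        with torch.cuda.stream(pg_stream):
            return f.value()[0]

    return fut.then(continuation)
```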
1 parent 5fe8f8b commit 7898bfd

File tree

torchft/multiprocessing_dummy_context.py

1 file changed: +135 −0
@@ -0,0 +1,135 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Multiprocessing Dummy Context
=============================

This module provides a context-like interface for multiprocessing.dummy,
which is a wrapper around the threading module that provides a
multiprocessing-like interface but uses threads instead of processes.

This allows code that uses multiprocessing.get_context() to work with
multiprocessing.dummy by providing a compatible interface.
"""

import multiprocessing.dummy as mp
import threading
from typing import Callable, Iterable, Mapping


class DummyContext:
    """
    A context-like class for multiprocessing.dummy that mimics the interface
    of a context returned by multiprocessing.get_context().
    """

    def __init__(self, method: object = None) -> None:
        """
        Initialize the dummy context.

        Args:
            method: Ignored; accepted only for compatibility with
                multiprocessing.get_context()
        """
        pass

    def Process(
        self,
        group: object = None,
        target: Callable[..., object] | None = None,
        name: str | None = None,
        args: Iterable[object] = (),
        kwargs: Mapping[str, object] = {},
        daemon: bool | None = None,
    ) -> mp.DummyProcess:
        """
        Create a Process using multiprocessing.dummy.Process.

        Note: ``daemon`` is accepted for interface compatibility but is not
        forwarded, since multiprocessing.dummy.Process does not take it.
        """
        return mp.Process(
            group=group, target=target, name=name, args=args, kwargs=kwargs
        )

    def Pipe(
        self, duplex: bool = True
    ) -> tuple[mp.connection.Connection, mp.connection.Connection]:
        """
        Create a Pipe using multiprocessing.dummy.Pipe.
        """
        return mp.Pipe(duplex)

    def Queue(self, maxsize: int = 0) -> mp.Queue:
        """
        Create a Queue using multiprocessing.dummy.Queue.
        """
        return mp.Queue(maxsize)

    def Event(self) -> threading.Event:
        """
        Create an Event using multiprocessing.dummy.Event.
        """
        return mp.Event()

    def Lock(self) -> threading.Lock:
        """
        Create a Lock using multiprocessing.dummy.Lock.
        """
        return mp.Lock()

    def RLock(self) -> threading.RLock:
        """
        Create an RLock using multiprocessing.dummy.RLock.
        """
        return mp.RLock()

    def Semaphore(self, value: int = 1) -> threading.Semaphore:
        """
        Create a Semaphore using multiprocessing.dummy.Semaphore.
        """
        return mp.Semaphore(value)

    def BoundedSemaphore(self, value: int = 1) -> threading.BoundedSemaphore:
        """
        Create a BoundedSemaphore using multiprocessing.dummy.BoundedSemaphore.
        """
        return mp.BoundedSemaphore(value)

    def Condition(
        self, lock: threading.Lock | threading.RLock | None = None
    ) -> threading.Condition:
        """
        Create a Condition using multiprocessing.dummy.Condition.
        """
        return mp.Condition(lock)

    def Manager(self) -> object:
        """
        Create a Manager using multiprocessing.dummy.Manager.
        """
        return mp.Manager()


def get_context(method: object = None) -> DummyContext:
    """
    Return a context object for multiprocessing.dummy.

    This function mimics multiprocessing.get_context() but returns a
    DummyContext that works with multiprocessing.dummy. It can be used to
    patch multiprocessing.dummy like so:

    ```
    import multiprocessing.dummy as mp
    from torchft.multiprocessing_dummy_context import get_context

    mp.get_context = get_context
    ```

    Args:
        method: Ignored; accepted only for compatibility with
            multiprocessing.get_context()

    Returns:
        A DummyContext instance
    """
    return DummyContext(method)
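As a usage sketch of the patch shown in the docstring (the `worker` function and the queue payload are hypothetical, not part of the module):

```
import multiprocessing.dummy as mp

from torchft.multiprocessing_dummy_context import get_context

# Patch so library code that calls mp.get_context() gets a
# thread-backed context with the same interface.
mp.get_context = get_context


def worker(q: mp.Queue) -> None:
    # Runs in a thread, so it shares the parent's CUDA context and
    # shows up in the parent's GPU profile.
    q.put("hello from a thread-backed process")


ctx = mp.get_context("spawn")  # the method argument is ignored
q = ctx.Queue()
p = ctx.Process(target=worker, args=(q,))
p.start()
print(q.get())
p.join()
```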
