 import os
 import re
 import sys
+import threading
 import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from contextlib import ExitStack
+from dataclasses import field
 from datetime import timedelta
 from typing import Any, Dict
 from unittest import TestCase, skipIf

 import torch
 from parameterized import parameterized
 from torch import nn, optim
+from torch.distributed.pipelining import SplitPoint, pipeline
 from torch.distributed.tensor import DTensor, Replicate

 from torchft._torchft import LighthouseServer
 from torchft.device_mesh import ft_init_device_mesh
 from torchft.local_sgd import DiLoCo, LocalSGD
 from torchft.manager import Manager
-from torchft.manager_integ_test import FailureInjector, MyModel, Runner
+from torchft.manager_integ_test import BarrierInjector, FailureInjector, MyModel, Runner
 from torchft.process_group import ProcessGroupBabyNCCL, ProcessGroupGloo

 logger: logging.Logger = logging.getLogger(__name__)
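
> Note: the new `threading` import exists to build a `threading.Barrier` for the upscale test added below. As a refresher on the primitive (a minimal standalone sketch, not part of this PR): `Barrier(n)` blocks each caller of `wait()` until `n` threads have arrived, then releases them all at once.

```python
import threading

barrier = threading.Barrier(3)  # releases once 3 threads are waiting

def worker(name: str) -> None:
    print(f"{name} waiting")
    barrier.wait()  # blocks until all three workers arrive
    print(f"{name} released")

threads = [threading.Thread(target=worker, args=(f"w{i}",)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```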
@@ -254,6 +257,10 @@ def state_dict() -> Dict[str, Dict[str, object]]: # pyre-ignore[53]
         all_state_dicts[manager_curr_step] = copy.deepcopy(
             manager._manager_state_dict()
         )
+
+        if runner.barrier_injector is not None:
+            runner.barrier_injector.check(manager_curr_step)
+
         batch_size = 1
         inputs = m.get_rand_inputs(batch_size, device=device)
         labels = m.get_rand_labels(batch_size, device=device)
@@ -276,6 +283,26 @@ def state_dict() -> Dict[str, Dict[str, object]]: # pyre-ignore[53]
     return {}


+def assert_equal_global_state(
+    rep0: dict[str, dict[str, dict[str, dict[str, object]]]],
+    rep1: dict[str, dict[str, dict[str, dict[str, object]]]],
+) -> None:
+    """
+    Asserts that the global state of the two replicas is equal.
+    """
+    for step in rep0.keys():
+        torch.testing.assert_close(
+            rep1[step]["user"]["default"]["original_params"],
+            rep0[step]["user"]["default"]["original_params"],
+            check_device=False,
+        )
+        torch.testing.assert_close(
+            rep1[step]["user"]["default"]["outer_optim"],
+            rep0[step]["user"]["default"]["outer_optim"],
+            check_device=False,
+        )
+
+
 class LocalSGDIntegTest(TestCase):
     # TODO: race condition due to using NCCL in threads causes manager allreduce to sometimes not be correct
     # Because of that the test is disabled for cuda
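
> The quadruple-nested `dict` annotation mirrors the per-step snapshots captured from `manager._manager_state_dict()` above: step, then the `"user"` namespace, then the fragment name (`"default"`), then the state keys compared here. A hypothetical example of the shape (the concrete keys and values come from the manager; treat this only as an illustration):

```python
import torch

# Hypothetical snapshot for a single manager step; the real values are the
# tensors and optimizer state captured by the train loop.
rep0 = {
    "3": {  # manager step (key type is illustrative)
        "user": {
            "default": {  # fragment name
                "original_params": {"weight": torch.zeros(2, 3)},
                "outer_optim": {"state": {}, "param_groups": []},
            }
        }
    }
}
```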
@@ -447,6 +474,9 @@ def test_diloco_recovery(self, use_cuda: bool) -> None:
         state_dicts = []

         for fut in as_completed(futures):
+            continue
+
+        for fut in futures:
             try:
                 state_dicts.append(fut.result()[0])
             except Exception as e:
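
> The empty `as_completed` loop is effectively a join: it blocks until every replica thread finishes (success or failure) before any `.result()` call can raise, and the second loop then collects results in submission order so `state_dicts[i]` lines up with replica `i`. `concurrent.futures.wait` expresses the same intent more directly (a sketch over the same `futures` list):

```python
from concurrent.futures import wait

# Block until every replica thread has finished, successful or not,
# so no thread is still running when an exception propagates.
wait(futures)

# Collect in submission order; .result() re-raises a replica's exception.
state_dicts = [fut.result()[0] for fut in futures]
```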
@@ -457,33 +487,23 @@ def test_diloco_recovery(self, use_cuda: bool) -> None:

         rep0, rep1 = state_dicts

-        for step in rep0.keys():
-            # Inner optimizer and local model parameters will be different, e.g.
-            # with 2 replicas r1 and r2, we sync every 2 steps:
-            #
-            # - Manager Step 1
-            #   - Step 1: r1 and r2 step
-            #   - Step 2: r1 and r2 step, sync the model, quorum succeeds
-            # - Manager Step 2
-            #   - Step 1: r1 steps but r2 fails
-            #   - Step 2:
-            #     - r1 steps, sync fails because r2 is down
-            #     - r1 recovers r2 from the model state at this step,
-            #       which is different from r1's model at the beginning
-            #       of Manager Step 2
-            #
-            # The outer optimizer and global model should be the same.
+        # Inner optimizer and local model parameters will be different, e.g.
+        # with 2 replicas r1 and r2, we sync every 2 steps:
+        #
+        # - Manager Step 1
+        #   - Step 1: r1 and r2 step
+        #   - Step 2: r1 and r2 step, sync the model, quorum succeeds
+        # - Manager Step 2
+        #   - Step 1: r1 steps but r2 fails
+        #   - Step 2:
+        #     - r1 steps, sync fails because r2 is down
+        #     - r1 recovers r2 from the model state at this step,
+        #       which is different from r1's model at the beginning
+        #       of Manager Step 2
+        #
+        # The outer optimizer and global model should be the same.
+        assert_equal_global_state(rep1, rep0)

-            torch.testing.assert_close(
-                rep1[step]["user"]["default"]["original_params"],
-                rep0[step]["user"]["default"]["original_params"],
-                check_device=False,
-            )
-            torch.testing.assert_close(
-                rep1[step]["user"]["default"]["outer_optim"],
-                rep0[step]["user"]["default"]["outer_optim"],
-                check_device=False,
-            )
         self.assertEqual(failure_injectors[1].count, 1)

         # pyre-fixme[56]: Pyre was not able to infer the type of argument
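
> For context on why only the outer state is compared: in DiLoCo each replica runs `sync_every` inner optimizer steps on its own data (so inner state diverges), then the replicas average their pseudo-gradients and take one outer step on the shared global parameters. A schematic sketch of that sync, using plain tensors rather than torchft's actual implementation:

```python
import torch

g = torch.zeros(4)             # global (outer) parameters
x1, x2 = g.clone(), g.clone()  # per-replica local parameters

for _ in range(2):              # sync_every inner steps per replica
    x1 -= 0.1 * torch.randn(4)  # inner steps diverge because each
    x2 -= 0.1 * torch.randn(4)  # replica sees different gradients

pseudo_grad = ((g - x1) + (g - x2)) / 2  # average pseudo-gradient
g -= 0.7 * pseudo_grad                   # outer optimizer step
x1 = g.clone()                           # replicas re-sync to the
x2 = g.clone()                           # updated global parameters
```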
@@ -552,6 +572,8 @@ def test_streaming_diloco_recovery(self, use_cuda: bool) -> None:

         rep0, rep1 = state_dicts

+        assert_equal_global_state(rep1, rep0)
+
         for step in rep1.keys():
             if step == 2:
                 # Replica 0 should have reset its `local_step` after failure
@@ -562,14 +584,93 @@ def test_streaming_diloco_recovery(self, use_cuda: bool) -> None:
                 rep0[step]["user"]["local_step"], rep1[step]["user"]["local_step"]
             )

-            torch.testing.assert_close(
-                rep1[step]["user"]["default"]["original_params"],
-                rep0[step]["user"]["default"]["original_params"],
-                check_device=False,
+        self.assertEqual(failure_injectors[1].count, 1)
+
+    CONFIG: list[tuple[bool, int, int]] = [
+        (use_cuda, n_fragments, fragment_sync_delay)
+        for use_cuda in [True, False]
+        for n_fragments in [1, 2]
+        for fragment_sync_delay in [0, 1]
+    ]
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of argument
+    @skipIf(sys.platform == "darwin", "not reliable on mac")
+    @parameterized.expand(CONFIG)
+    def test_streaming_diloco_upscale(
+        self, use_cuda: bool, n_fragments: int, fragment_sync_delay: int
+    ) -> None:
+        # Skip the test if use_cuda is True and there are not enough GPUs
+        if use_cuda and torch.cuda.device_count() < 2:
+            self.skipTest("Not enough GPUs for CUDA test")
+
+        lighthouse = LighthouseServer(
+            bind="[::]:0",
+            min_replicas=2,
+        )
+        num_replicas = 3
+        futures = []
+        executors = []
+
+        barrier = threading.Barrier(num_replicas)
+
+        barrier_injectors = [
+            # Make replica 0 join only after the other replicas have taken 2 steps
+            BarrierInjector().barrier_at(0, barrier),
+            BarrierInjector().barrier_at(2, barrier),
+            BarrierInjector().barrier_at(2, barrier),
+        ]
+
+        torch.manual_seed(42)
+        # Initialize the model so we can pass in the state_dict
+        m: nn.Module = MultiMyModel(2, 3, n_fragments)
+
+        for replica_id, barrier_injector in zip(range(num_replicas), barrier_injectors):
+            executor = ThreadPoolExecutor(max_workers=1)
+            executors.append(executor)
+            runner = Runner(
+                replica_id=replica_id,
+                num_replicas=num_replicas,
+                lighthouse_address=lighthouse.address(),
+                failure_injector=FailureInjector(),
+                barrier_injector=barrier_injector,
+                train_loop=diloco_train_loop,
+                train_loop_args={
+                    "model_state_dict": m.state_dict(),
+                    "n_fragments": n_fragments,
+                    "diloco_args": {
+                        "fragment_sync_delay": fragment_sync_delay,
+                        "sync_every": 4,
+                    },
+                },
             )
-            torch.testing.assert_close(
-                rep1[step]["user"]["default"]["outer_optim"],
-                rep0[step]["user"]["default"]["outer_optim"],
-                check_device=False,
+            futures.append(executor.submit(runner.run_replica))
+
+        state_dicts = []
+
+        for fut in as_completed(futures):
+            continue
+
+        for fut in futures:
+            try:
+                state_dicts.append(fut.result()[0])
+            except Exception as e:
+                print(e)
+                raise
+
+        lighthouse.shutdown()
+
+        rep0, rep1, rep2 = state_dicts
+
+        assert_equal_global_state(rep0, rep1)
+        assert_equal_global_state(rep0, rep2)
+
+        for step in rep0.keys():
+            self.assertEqual(
+                rep0[step]["user"]["local_step"], rep1[step]["user"]["local_step"]
             )
-        self.assertEqual(failure_injectors[1].count, 1)
+            self.assertEqual(
+                rep1[step]["user"]["local_step"], rep2[step]["user"]["local_step"]
+            )
+
+        for barrier_injector in barrier_injectors:
+            self.assertEqual(barrier_injector.count, 1)
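
> `BarrierInjector` itself lives in `torchft/manager_integ_test.py` and is not shown in this diff. From its call sites here — the fluent `barrier_at(step, barrier)`, the `check(step)` hook in the train loop, and the final `count` assertion — a minimal implementation might look like the following (a hypothetical sketch inferred from usage, not the actual torchft code):

```python
import threading
from typing import Optional


class BarrierInjector:
    """Blocks a replica at a given manager step until all parties arrive.

    Hypothetical sketch based on the call sites in this diff; see
    torchft/manager_integ_test.py for the real implementation.
    """

    def __init__(self) -> None:
        self._step: Optional[int] = None
        self._barrier: Optional[threading.Barrier] = None
        self.count = 0  # how many times this injector fired

    def barrier_at(
        self, step: int, barrier: threading.Barrier
    ) -> "BarrierInjector":
        self._step = step
        self._barrier = barrier
        return self  # fluent style: BarrierInjector().barrier_at(...)

    def check(self, step: int) -> None:
        # Called from the train loop each manager step; wait exactly once
        # when the configured step is reached.
        if self._barrier is not None and step == self._step:
            self._barrier.wait()
            self._barrier = None  # fire only once
            self.count += 1
```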