
Commit caebdb1

Yasha Singh authored and facebook-github-bot committed
Fix long CPU->GPU synchronization during Gradient clipping (#3318)
Summary:
Pull Request resolved: #3318

1. Recent changes from D79128843 introduced a sync point in `clipping.py`, which was visible in the trace.
2. The code created CPU tensors and moved them **synchronously** to CUDA devices, causing long wait times in training that showed up as `CudaStreamSynchronization` in the trace.
3. This caused QPS degradation in the CTX FM model I was actively optimizing, and in most models that enable optimizer gradient clipping in their yaml config, including OmniFM.
4. This fix bumps QPS by around 5% while keeping NE unimpacted.

Reviewed By: wz337

Differential Revision: D80959986

fbshipit-source-id: 55b0ae4165cabe4d5ce66ad442814868d408a1ac
1 parent 31cf49f commit caebdb1

File tree

1 file changed: +4 −3 lines


torchrec/optim/clipping.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -201,7 +201,7 @@ def _compute_total_norm(
         """
 
         ## compute the norm |W|^p corresponding to all sharded params W
-        sharded_grad_norm: torch.Tensor = torch.tensor(0.0)
+        sharded_grad_norm: torch.Tensor = torch.tensor(0.0, pin_memory=True)
         combine_norm_operator = torch.maximum if norm_type == torch.inf else torch.add
 
         # We need to move sharded_grad_norm to the same device as the first shard so that we can do addition (or take max)
@@ -216,7 +216,8 @@ def _compute_total_norm(
                 process_groups=pgs,
             )
             sharded_grad_norm = combine_norm_operator(
-                sharded_grad_norm.to(current_shard_norm.device), current_shard_norm
+                sharded_grad_norm.to(current_shard_norm.device, non_blocking=True),
+                current_shard_norm,
             )
         # compute |W|^p corresponding to all replicate params W
         # Similar to the case above, we move replicate_grad_norm to the same device as sharded_grad_norm so that we can do addition.
@@ -226,7 +227,7 @@ def _compute_total_norm(
             )
             if replicate_grads
             else torch.tensor(0.0)
-        ).to(sharded_grad_norm.device)
+        ).to(sharded_grad_norm.device, non_blocking=True)
 
         # In the p-norm case, we are given norms |W_sharded|^p and |W_replicate|^p. To compute the total norm, we need to
         # sum them and take the p-th root. In the inf-norm case, we are given max(|W_sharded|) and max(|W_replicate|).
```
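For context, a minimal sketch of the pattern this diff applies (illustrative names, not torchrec code; assumes a CUDA device is available): pinning the host-side scalar and passing `non_blocking=True` lets the CPU→GPU copy be enqueued on the CUDA stream instead of blocking the host until the transfer completes.

```python
import torch

# Hypothetical illustration of the async host-to-device pattern in this fix.
if torch.cuda.is_available():
    device = torch.device("cuda")
    shard_norm = torch.rand((), device=device)  # stand-in for a per-shard grad norm

    # Before: a pageable CPU scalar copied synchronously, which can show up as a
    # long CudaStreamSynchronization wait in the trace.
    total = torch.tensor(0.0).to(device)

    # After: pin the host memory and request a non-blocking copy; the transfer is
    # enqueued on the current stream and the host thread keeps running.
    total = torch.tensor(0.0, pin_memory=True).to(device, non_blocking=True)

    # Later GPU ops on the same stream (add / maximum, as in _compute_total_norm)
    # are ordered after the copy, so no explicit synchronization is needed here.
    total = torch.add(total, shard_norm)
```

Note that `non_blocking=True` only yields a truly asynchronous copy when the source host memory is pinned, which is why the fix also adds `pin_memory=True` when creating the CPU scalar.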
