17 | 17 | maybe_execute_in_parallel |
18 | 18 | from tensorrt_llm._torch.modules.rotary_embedding import RotaryEmbedding |
19 | 19 | from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager |
| 20 | +from tensorrt_llm._torch.utils import maybe_compile |
20 | 21 | from tensorrt_llm._utils import get_size_in_bytes |
21 | 22 | from tensorrt_llm.bindings import DataType |
22 | 23 | from tensorrt_llm.bindings.executor import KvCacheConfig |
@@ -572,6 +573,11 @@ def update_for_spec_dec(self): |
572 | 573 | self.on_update_kv_lens() |
573 | 574 |
574 | 575 |
| 576 | +@maybe_compile(dynamic=True) |
| 577 | +def _scale(weights, q_scale, s): |
| 578 | + return weights * q_scale.squeeze(-1) * s |
| 579 | + |
| 580 | + |
575 | 581 | class Indexer(nn.Module): |
576 | 582 |
577 | 583 | def __init__(self, |
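
The new _scale helper hoists the element-wise re-scaling out of Indexer.weight_scale so it can be decorated with maybe_compile(dynamic=True). The decorator body is not shown in this diff; as a rough mental model only (an assumption, not the actual tensorrt_llm._torch.utils implementation), it acts like a guarded torch.compile wrapper, so the two multiplies and the squeeze can fuse into one kernel when compilation is enabled and fall back to eager mode otherwise. The sketch below uses a made-up switch name to make that gating explicit.

    # Illustrative stand-in for tensorrt_llm._torch.utils.maybe_compile;
    # the real decorator's gating logic may differ.
    import os

    import torch


    def sketch_maybe_compile(**compile_kwargs):

        def decorator(fn):
            # Hypothetical opt-out switch, not a real TRT-LLM flag.
            if os.environ.get("SKETCH_DISABLE_COMPILE", "0") == "1":
                return fn  # eager fallback
            return torch.compile(fn, **compile_kwargs)

        return decorator


    @sketch_maybe_compile(dynamic=True)
    def _scale(weights, q_scale, s):
        # q_scale carries a trailing singleton dim; squeezing it lets the
        # multiply broadcast elementwise, and the fp32 q_scale promotes the
        # result to float32 exactly as the old unsqueeze/squeeze path did.
        return weights * q_scale.squeeze(-1) * s

Passing dynamic=True keeps a single dynamic-shape graph even though the leading token dimension changes from step to step, avoiding a recompile per batch size.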
@@ -962,9 +968,6 @@ def sparse_attn_indexer( |
962 | 968 | device=hidden_states.device) |
963 | 969 | topk_indices_buffer[:hidden_states.shape[0]] = -1 |
964 | 970 |
965 | | - # Store k_fp8 and k_scale into indexer k cache |
966 | | - self._update_k_cache(k_fp8, k_scale, metadata) |
967 | | - |
968 | 971 | if has_prefill: |
969 | 972 | # Use chunked prefill to reduce memory footprint |
970 | 973 | if metadata.indexer_prefill_chunks is not None: |
@@ -1099,9 +1102,7 @@ def weight_scale(self, hidden_states: torch.Tensor, |
1099 | 1102 | q_scale: torch.Tensor) -> torch.Tensor: |
1100 | 1103 | weights = indexer_weights if indexer_weights is not None else self.weights_proj( |
1101 | 1104 | hidden_states) |
1102 | | - weights = weights.unsqueeze(-1) * q_scale * self.weight_scale_factor |
1103 | | - # output weights is guaranteed to be float32 due to type promotion from q_scale (float32) |
1104 | | - weights = weights.squeeze(-1) |
| 1105 | + weights = _scale(weights, q_scale, self.weight_scale_factor) |
1105 | 1106 | return weights |
1106 | 1107 |
1107 | 1108 | @torch.inference_mode() |
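
The rewritten weight_scale is numerically equivalent to the removed unsqueeze/multiply/squeeze sequence: with weights of shape [num_tokens, n_heads] from weights_proj and q_scale viewed as [num_tokens, n_heads, 1], multiplying by q_scale.squeeze(-1) performs the same broadcast, and the float32 q_scale still promotes the output to float32, which is what the deleted comment used to guarantee. A standalone check of that equivalence (shapes, dtypes and the scale factor below are placeholders, not the model's real values):

    import torch

    num_tokens, n_heads = 8, 64
    weights = torch.randn(num_tokens, n_heads, dtype=torch.bfloat16)
    q_scale = torch.rand(num_tokens, n_heads, 1, dtype=torch.float32)
    weight_scale_factor = 0.5  # placeholder value

    # Old path: unsqueeze weights to [num_tokens, n_heads, 1], scale, squeeze back.
    old = (weights.unsqueeze(-1) * q_scale * weight_scale_factor).squeeze(-1)
    # New path, i.e. what _scale computes: squeeze q_scale instead.
    new = weights * q_scale.squeeze(-1) * weight_scale_factor

    assert old.dtype == new.dtype == torch.float32  # promoted by fp32 q_scale
    torch.testing.assert_close(old, new)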
@@ -1170,7 +1171,15 @@ def _prep_q_or_k(qk_pe, qk_nope): |
1170 | 1171 | q_fp8 = q_fp8.view(-1, self.n_heads, self.head_dim) |
1171 | 1172 | q_scale = q_scale.view(-1, self.n_heads, 1) |
1172 | 1173 |
1173 | | - weights = self.weight_scale(hidden_states, indexer_weights, q_scale) |
| 1174 | + weights, _ = maybe_execute_in_parallel( |
| 1175 | + lambda: self.weight_scale(hidden_states, indexer_weights, q_scale), |
| 1176 | + lambda: self._update_k_cache( |
| 1177 | + k_fp8, k_scale, metadata), # store k_fp8 and k_scale in k cache |
| 1178 | + self.ln_events[0], |
| 1179 | + self.ln_events[1], |
| 1180 | + self.aux_stream, |
| 1181 | + ) |
| 1182 | + |
1174 | 1183 | # Return topk indices buffer for sparse attention [num_tokens, index_topk] |
1175 | 1184 | return self.sparse_attn_indexer(metadata, hidden_states, q_fp8, k_fp8, |
1176 | 1185 | k_scale, weights) |
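
With the k-cache update pulled out of sparse_attn_indexer, maybe_execute_in_parallel overlaps it with the weight scaling: as called here it takes the two callables, a pair of CUDA events, and the auxiliary stream, and returns both results as a tuple (only weights is consumed; the cache update runs for its side effect). A minimal sketch of the stream/event pattern such a helper typically implements, assuming it hands fn1 to the aux stream and synchronizes back before returning; the real tensorrt_llm helper may differ in detail:

    import torch


    def sketch_execute_in_parallel(fn0, fn1, event0, event1, aux_stream):
        # Assumed overlap pattern: fn1 runs on the aux stream while fn0 stays
        # on the main stream; the two events order the hand-off both ways.
        main_stream = torch.cuda.current_stream()
        event0.record(main_stream)       # make fn1's inputs visible to the aux stream
        with torch.cuda.stream(aux_stream):
            aux_stream.wait_event(event0)
            out1 = fn1()                 # e.g. the indexer k-cache update
            event1.record(aux_stream)
        out0 = fn0()                     # e.g. weight_scale on the main stream
        main_stream.wait_event(event1)   # later kernels see both results
        return out0, out1

The overlap is safe because weight_scale only reads hidden_states, indexer_weights and q_scale, while _update_k_cache only touches k_fp8, k_scale and the indexer k cache, so neither callable depends on the other's output.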