From dcf59c71b1d84c4212c45fa250878b89d8bb9a57 Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:30:35 -0700
Subject: [PATCH 1/3] Skip layernorm if set to None

---
 tensorrt_llm/_torch/models/modeling_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index eab023a3b..ee6e0bda1 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -464,7 +464,7 @@ def forward(
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
                                                       hidden_states, residual)
 
-        if self.fusion_config.POST_MOE_FUSION or self.fusion_config.POST_MLP_FUSION:
+        if (self.fusion_config.POST_MOE_FUSION or self.fusion_config.POST_MLP_FUSION) and self.next_layer_layernorm is not None:
             if min_latency_mode:
                 shared_output = hidden_states[0]
                 hidden_states_activated_experts = hidden_states[1]

From 31f25cba85d4af60baeb78e6ebcd886a2ce7328c Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:34:42 -0700
Subject: [PATCH 2/3] enable moe/mlp fusion

---
 tensorrt_llm/_torch/models/modeling_llama.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index ee6e0bda1..4d5d84a23 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -347,8 +347,8 @@ def __init__(
         # self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp(
         # )
         # TODO: re-enable these fusions
-        self.fusion_config.PRE_MOE_FUSION = False
-        self.fusion_config.POST_MLP_FUSION = False
+        # self.fusion_config.PRE_MOE_FUSION = False
+        # self.fusion_config.POST_MLP_FUSION = False
 
         self.self_attn = Llama4Attention(
             model_config,
@@ -374,6 +374,9 @@ def __init__(
             # self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp(
             # )
 
+            self.fusion_config.PRE_MLP_FUSION = model_config.mapping.has_tp()
+            self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp()
+
         else:
             self.feed_forward = Llama4MoE(
                 num_experts=config.num_local_experts,
@@ -385,6 +388,10 @@
                 aux_stream=aux_stream,
                 dtype=config.torch_dtype)
 
+            self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp()
+            self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp()
+
+
             # self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp(
             # )
 
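Note on patch 1/3: the extra `is not None` check matters because the last decoder layer has no following layer, so its next_layer_layernorm stays None and the fused allreduce + residual-add + RMSNorm path cannot be taken there. The following is a minimal, self-contained sketch of that control flow; FusionConfig, DecoderLayerStub, and the fallback arithmetic below are illustrative stand-ins, not the actual TensorRT-LLM implementation.

# Sketch of the guard added by PATCH 1/3; all names are hypothetical stand-ins.
from dataclasses import dataclass
from typing import Callable, Optional

import torch


@dataclass
class FusionConfig:
    POST_MOE_FUSION: bool = False
    POST_MLP_FUSION: bool = False


class DecoderLayerStub:
    def __init__(self, fusion_config: FusionConfig,
                 next_layer_layernorm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None):
        self.fusion_config = fusion_config
        # None for the last decoder layer: there is no next input layernorm to fuse into.
        self.next_layer_layernorm = next_layer_layernorm

    def post_ffn(self, hidden_states: torch.Tensor,
                 residual: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # The fused path is only valid when a next-layer layernorm exists,
        # hence the `is not None` check added by the patch.
        if (self.fusion_config.POST_MOE_FUSION
                or self.fusion_config.POST_MLP_FUSION) \
                and self.next_layer_layernorm is not None:
            # Stand-in for the fused allreduce + residual add + next-layer RMSNorm step.
            residual = hidden_states + residual
            return self.next_layer_layernorm(residual), residual
        # Unfused fallback, e.g. the last layer before the final model norm.
        return hidden_states + residual, residual

With this shape, re-enabling the fusion flags (as patch 2/3 attempts) presumably no longer breaks on the final layer, since that layer simply falls back to the unfused path.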
From d7cc5082bfa2fed256abeacf2511dc7e3cab6686 Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:35:47 -0700
Subject: [PATCH 3/3] Revert "enable moe/mlp fusion"

This reverts commit 6e175ce047c10302bd435d25af20f2892cb14bee.
---
 tensorrt_llm/_torch/models/modeling_llama.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index 4d5d84a23..ee6e0bda1 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -347,8 +347,8 @@ def __init__(
         # self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp(
         # )
         # TODO: re-enable these fusions
-        # self.fusion_config.PRE_MOE_FUSION = False
-        # self.fusion_config.POST_MLP_FUSION = False
+        self.fusion_config.PRE_MOE_FUSION = False
+        self.fusion_config.POST_MLP_FUSION = False
 
         self.self_attn = Llama4Attention(
             model_config,
@@ -374,9 +374,6 @@ def __init__(
             # self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp(
             # )
 
-            self.fusion_config.PRE_MLP_FUSION = model_config.mapping.has_tp()
-            self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp()
-
         else:
             self.feed_forward = Llama4MoE(
                 num_experts=config.num_local_experts,
@@ -388,10 +385,6 @@
                 aux_stream=aux_stream,
                 dtype=config.torch_dtype)
 
-            self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp()
-            self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp()
-
-
             # self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp(
             # )
 
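For reference, the change that patch 2/3 introduces and patch 3/3 immediately reverts boils down to gating the per-layer fusion flags on whether tensor parallelism is active. Below is a hedged sketch of that pattern under stated assumptions: MappingStub, LayerFusionConfig, and configure_fusion are hypothetical helpers standing in for model_config.mapping and the decoder layer's fusion_config, not part of tensorrt_llm.

# Illustrative sketch only; these names are hypothetical, not the real TensorRT-LLM API.
from dataclasses import dataclass


@dataclass
class MappingStub:
    tp_size: int = 1

    def has_tp(self) -> bool:
        # Mirrors the spirit of model_config.mapping.has_tp(): true only when
        # the model is sharded across more than one tensor-parallel rank.
        return self.tp_size > 1


@dataclass
class LayerFusionConfig:
    PRE_MOE_FUSION: bool = False
    POST_MOE_FUSION: bool = False
    PRE_MLP_FUSION: bool = False
    POST_MLP_FUSION: bool = False


def configure_fusion(cfg: LayerFusionConfig, mapping: MappingStub, is_mlp_layer: bool) -> None:
    # Patch 2/3 sets the pre/post flags per layer type from the TP mapping;
    # patch 3/3 reverts this, leaving the fusions disabled for now.
    if is_mlp_layer:
        cfg.PRE_MLP_FUSION = mapping.has_tp()
        cfg.POST_MLP_FUSION = mapping.has_tp()
    else:
        cfg.PRE_MOE_FUSION = mapping.has_tp()
        cfg.POST_MOE_FUSION = mapping.has_tp()

The net effect of the series is therefore only the None guard from patch 1/3; the TP-gated flag assignments stay out of the tree pending the existing "TODO: re-enable these fusions".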