From dcf59c71b1d84c4212c45fa250878b89d8bb9a57 Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:30:35 -0700
Subject: [PATCH 1/3] Skip layernorm if set to None

---
 tensorrt_llm/_torch/models/modeling_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index eab023a3b..ee6e0bda1 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -464,7 +464,7 @@ def forward(
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
                                                       hidden_states, residual)
 
-        if self.fusion_config.POST_MOE_FUSION or self.fusion_config.POST_MLP_FUSION:
+        if (self.fusion_config.POST_MOE_FUSION or self.fusion_config.POST_MLP_FUSION) and self.next_layer_layernorm is not None:
             if min_latency_mode:
                 shared_output = hidden_states[0]
                 hidden_states_activated_experts = hidden_states[1]

From 31f25cba85d4af60baeb78e6ebcd886a2ce7328c Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:34:42 -0700
Subject: [PATCH 2/3] enable moe/mlp fusion

---
 tensorrt_llm/_torch/models/modeling_llama.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index ee6e0bda1..4d5d84a23 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -347,8 +347,8 @@ def __init__(
         # self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp(
         # )
         # TODO: re-enable these fusions
-        self.fusion_config.PRE_MOE_FUSION = False
-        self.fusion_config.POST_MLP_FUSION = False
+        # self.fusion_config.PRE_MOE_FUSION = False
+        # self.fusion_config.POST_MLP_FUSION = False
 
         self.self_attn = Llama4Attention(
             model_config,
@@ -374,6 +374,9 @@ def __init__(
             # self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp(
             # )
 
+            self.fusion_config.PRE_MLP_FUSION = model_config.mapping.has_tp()
+            self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp()
+
         else:
             self.feed_forward = Llama4MoE(
                 num_experts=config.num_local_experts,
@@ -385,6 +388,10 @@
                 aux_stream=aux_stream,
                 dtype=config.torch_dtype)
 
+            self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp()
+            self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp()
+
+
             # self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp(
             # )
 
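Note on patch 1/3: the extra `is not None` check matters because the last decoder layer has no following layer, so its next_layer_layernorm stays None and the fused allreduce + residual-add + RMSNorm path cannot be taken there. The following is a minimal, self-contained sketch of that control flow; FusionConfig, DecoderLayerStub, and the fallback arithmetic below are illustrative stand-ins, not the actual TensorRT-LLM implementation.

# Sketch of the guard added by PATCH 1/3; all names are hypothetical stand-ins.
from dataclasses import dataclass
from typing import Callable, Optional

import torch


@dataclass
class FusionConfig:
    POST_MOE_FUSION: bool = False
    POST_MLP_FUSION: bool = False


class DecoderLayerStub:
    def __init__(self, fusion_config: FusionConfig,
                 next_layer_layernorm: Optional[Callable[[torch.Tensor], torch.Tensor]] = None):
        self.fusion_config = fusion_config
        # None for the last decoder layer: there is no next input layernorm to fuse into.
        self.next_layer_layernorm = next_layer_layernorm

    def post_ffn(self, hidden_states: torch.Tensor,
                 residual: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # The fused path is only valid when a next-layer layernorm exists,
        # hence the `is not None` check added by the patch.
        if (self.fusion_config.POST_MOE_FUSION
                or self.fusion_config.POST_MLP_FUSION) \
                and self.next_layer_layernorm is not None:
            # Stand-in for the fused allreduce + residual add + next-layer RMSNorm step.
            residual = hidden_states + residual
            return self.next_layer_layernorm(residual), residual
        # Unfused fallback, e.g. the last layer before the final model norm.
        return hidden_states + residual, residual

With this shape, re-enabling the fusion flags (as patch 2/3 attempts) presumably no longer breaks on the final layer, since that layer simply falls back to the unfused path.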
From d7cc5082bfa2fed256abeacf2511dc7e3cab6686 Mon Sep 17 00:00:00 2001
From: Shobhit Verma
Date: Thu, 24 Apr 2025 23:35:47 -0700
Subject: [PATCH 3/3] Revert "enable moe/mlp fusion"

This reverts commit 6e175ce047c10302bd435d25af20f2892cb14bee.
---
 tensorrt_llm/_torch/models/modeling_llama.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index 4d5d84a23..ee6e0bda1 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -347,8 +347,8 @@ def __init__(
         # self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp(
         # )
         # TODO: re-enable these fusions
-        # self.fusion_config.PRE_MOE_FUSION = False
-        # self.fusion_config.POST_MLP_FUSION = False
+        self.fusion_config.PRE_MOE_FUSION = False
+        self.fusion_config.POST_MLP_FUSION = False
 
         self.self_attn = Llama4Attention(
             model_config,
@@ -374,9 +374,6 @@ def __init__(
             # self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp(
             # )
 
-            self.fusion_config.PRE_MLP_FUSION = model_config.mapping.has_tp()
-            self.fusion_config.POST_MLP_FUSION = model_config.mapping.has_tp()
-
         else:
             self.feed_forward = Llama4MoE(
                 num_experts=config.num_local_experts,
@@ -388,10 +385,6 @@
                 aux_stream=aux_stream,
                 dtype=config.torch_dtype)
 
-            self.fusion_config.PRE_MOE_FUSION = model_config.mapping.has_tp()
-            self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp()
-
-
             # self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp(
             # )
 
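For reference, the change that patch 2/3 introduces and patch 3/3 immediately reverts boils down to gating the per-layer fusion flags on whether tensor parallelism is active. Below is a hedged sketch of that pattern under stated assumptions: MappingStub, LayerFusionConfig, and configure_fusion are hypothetical helpers standing in for model_config.mapping and the decoder layer's fusion_config, not part of tensorrt_llm.

# Illustrative sketch only; these names are hypothetical, not the real TensorRT-LLM API.
from dataclasses import dataclass


@dataclass
class MappingStub:
    tp_size: int = 1

    def has_tp(self) -> bool:
        # Mirrors the spirit of model_config.mapping.has_tp(): true only when
        # the model is sharded across more than one tensor-parallel rank.
        return self.tp_size > 1


@dataclass
class LayerFusionConfig:
    PRE_MOE_FUSION: bool = False
    POST_MOE_FUSION: bool = False
    PRE_MLP_FUSION: bool = False
    POST_MLP_FUSION: bool = False


def configure_fusion(cfg: LayerFusionConfig, mapping: MappingStub, is_mlp_layer: bool) -> None:
    # Patch 2/3 sets the pre/post flags per layer type from the TP mapping;
    # patch 3/3 reverts this, leaving the fusions disabled for now.
    if is_mlp_layer:
        cfg.PRE_MLP_FUSION = mapping.has_tp()
        cfg.POST_MLP_FUSION = mapping.has_tp()
    else:
        cfg.PRE_MOE_FUSION = mapping.has_tp()
        cfg.POST_MOE_FUSION = mapping.has_tp()

The net effect of the series is therefore only the None guard from patch 1/3; the TP-gated flag assignments stay out of the tree pending the existing "TODO: re-enable these fusions".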