diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index ede87972185..36a98e54dbe 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -21,6 +21,7 @@
 from paddleformers.utils.log import logger
 
 from fastdeploy import envs
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.utils import h2d_copy, slice_fn
 from fastdeploy.platforms import current_platform
@@ -593,7 +594,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
         out = multi_outs[:token_num, :]
         return out
 
-    def forward(self, x: paddle.Tensor, gate: nn.Layer):
+    def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta):
         """
         Defines the forward computation of the moe layer.
 
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
index 04fa0abd09b..5fab9808dbe 100644
--- a/fastdeploy/model_executor/models/deepseek_v3.py
+++ b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -104,7 +104,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -187,10 +187,10 @@ def load_state_dict(self, state_dict):
         self.experts.load_state_dict(state_dict)
         self.shared_experts.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
         """ """
-        shared_experts_out = self.shared_experts(hidden_states)
-        moe_out = self.experts(hidden_states, self.gate)
+        shared_experts_out = self.shared_experts(hidden_states, forward_meta)
+        moe_out = self.experts(hidden_states, self.gate, forward_meta)
         moe_out = moe_out + shared_experts_out
         # We do to TP all reduce after the sum of experts.
         if self.tp_size > 1:
@@ -514,7 +514,7 @@ def forward(
         hidden_states = self.self_attn(forward_meta, hidden_states, position_ids, mask_encoder_batch)
 
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
         return hidden_states, residual
 
 
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 75947590be8..7d26764cb38 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -95,7 +95,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
         gate_up_out = self.up_gate_proj(hidden_states)
         act_out = self.act_fn(gate_up_out)
         down_out = self.down_proj(act_out)
@@ -213,10 +213,18 @@ def load_state_dict(self, state_dict):
     def update_state_dict(self, state_dict):
         self.fused_moe.load_state_dict(state_dict, True)
 
-    def forward(self, hidden_states: paddle.Tensor):
-        out = self.experts(hidden_states, self.gate)
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        forward_meta: ForwardMeta,
+    ):
+        out = self.experts(
+            x=hidden_states,
+            gate=self.gate,
+            forward_meta=forward_meta,
+        )
         if self.num_shared_experts > 0:
-            s_x = self.shared_experts(hidden_states)
+            s_x = self.shared_experts(hidden_states, forward_meta)
             out = out + s_x
         return out
 
@@ -344,7 +352,10 @@ def forward(
             residual,
         )
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(
+            hidden_states=hidden_states,
+            forward_meta=forward_meta,
+        )
 
         return hidden_states, residual
 
@@ -609,7 +620,7 @@ def compute_logits(self, hidden_states: paddle.Tensor):
 
         return logits
 
-    def empty_input_forward(self):
+    def empty_input_forward(self, forward_meta):
         """
         empty_input_forward
         """
@@ -621,7 +632,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate)
+            self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate, forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py
index 0aedb040062..2d57ed504cb 100644
--- a/fastdeploy/model_executor/models/ernie4_5_mtp.py
+++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py
@@ -436,7 +436,7 @@ def compute_logits(self, hidden_states: paddle.Tensor):
 
         return logits
 
-    def empty_input_forward(self):
+    def empty_input_forward(self, forward_meta):
         """
         empty_input_forward
         """
@@ -448,7 +448,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.fused_moe(fake_hidden_states)
+            self.ernie.layers[i].mlp.fused_moe(hidden_states=fake_hidden_states, forward_meta=forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index a291db0e9a5..7c3685f9b22 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -169,8 +169,8 @@ def __init__(
             model_format="",
         )
 
-    def forward(self, hidden_states: paddle.Tensor):
-        out = self.experts(hidden_states, self.gate)
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
+        out = self.experts(hidden_states, self.gate, forward_meta)
         return out
 
     def load_state_dict(self, state_dict):
@@ -269,9 +269,9 @@ def load_state_dict(self, state_dict):
         if self.num_shared_experts > 0:
             self.shared_experts.load_state_dict(state_dict)
 
-    def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta, vl_moe_meta: VLMoEMeta):
         if self.num_shared_experts > 0:
-            shared_experts_out = self.shared_experts(hidden_states)
+            shared_experts_out = self.shared_experts(hidden_states, forward_meta)
         hidden_states, text_input, image_input = text_image_gather_scatter(
             hidden_states,
             vl_moe_meta.text_input,
@@ -281,8 +281,8 @@ def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta):
             vl_moe_meta.image_index,
             True,
         )
-        text_out = self.text_fused_moe(text_input)
-        image_out = self.image_fused_moe(image_input)
+        text_out = self.text_fused_moe(text_input, forward_meta)
+        image_out = self.image_fused_moe(image_input, forward_meta)
         hidden_states, _, _ = text_image_gather_scatter(
             hidden_states,
             text_out,
@@ -388,9 +388,9 @@ def forward(
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
         if isinstance(self.mlp, Ernie4_5_VLMoE):
-            hidden_states = self.mlp(hidden_states, vl_moe_meta)
+            hidden_states = self.mlp(hidden_states, forward_meta, vl_moe_meta)
         else:
-            hidden_states = self.mlp(hidden_states)
+            hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
 
@@ -745,7 +745,7 @@ def compute_logits(self, hidden_states: paddle.Tensor):
 
         return logits
 
-    def empty_input_forward(self):
+    def empty_input_forward(self, forward_meta):
         """
         empty_input_forward
         """
@@ -757,8 +757,8 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
-            self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)
+            self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states, forward_meta)
+            self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states, forward_meta)
 
     def get_input_embeddings(
         self,
diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index 8850ce81243..c18762d49d4 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -85,7 +85,7 @@ def __init__(
             act_method=fd_config.model_config.hidden_act,
         )
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -161,9 +161,9 @@ def __init__(
             reduce_results=False,
         )
 
-    def forward(self, x):
-        shared_experts_out = self.shared_experts(x)
-        out = self.experts(x, self.gate)
+    def forward(self, x, forward_meta):
+        shared_experts_out = self.shared_experts(x, forward_meta)
+        out = self.experts(x, self.gate, forward_meta)
         out = out + shared_experts_out
         # We do to TP all reduce after the sum of experts.
         if self.tensor_parallel_size > 1:
@@ -306,7 +306,10 @@ def forward(
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(
+            hidden_states,
+            forward_meta,
+        )
 
         return hidden_states, residual
 
diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py
index e951fff92f5..682c9f5f1ec 100644
--- a/fastdeploy/model_executor/models/gpt_oss.py
+++ b/fastdeploy/model_executor/models/gpt_oss.py
@@ -124,8 +124,8 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = ""):
             model_format="",
         )
 
-    def forward(self, hidden_states: paddle.Tensor):
-        expert_output = self.experts(hidden_states, self.router)
+    def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
+        expert_output = self.experts(hidden_states, self.router, forward_meta)
         return expert_output
 
 
@@ -173,7 +173,7 @@ def forward(
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
         return hidden_states, residual
 
 
diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py
index d49f3a32705..e56423c9136 100644
--- a/fastdeploy/model_executor/models/qwen2.py
+++ b/fastdeploy/model_executor/models/qwen2.py
@@ -90,7 +90,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -206,7 +206,7 @@ def forward(
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
 
diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py
index 9537b84f22c..5f67ba75b99 100644
--- a/fastdeploy/model_executor/models/qwen3moe.py
+++ b/fastdeploy/model_executor/models/qwen3moe.py
@@ -79,8 +79,8 @@ def __init__(
             weight_dtype="float32",
         )
 
-    def forward(self, x):
-        return self.experts(x, self.gate)
+    def forward(self, x, forward_meta):
+        return self.experts(x, self.gate, forward_meta)
 
     def load_state_dict(self, state_dict):
         """ """
@@ -127,7 +127,7 @@ def load_state_dict(self, state_dict):
         self.up_gate_proj.load_state_dict(state_dict)
         self.down_proj.load_state_dict(state_dict)
 
-    def forward(self, x):
+    def forward(self, x, forward_meta):
         """ """
         gate_up_out = self.up_gate_proj(x)
         act_out = self.act_fn(gate_up_out)
@@ -206,7 +206,7 @@ def forward(
         # Fully Connected
         hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
 
-        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states, forward_meta)
 
         return hidden_states, residual
 
@@ -418,7 +418,7 @@ def compute_logits(self, hidden_states: paddle.Tensor):
 
         return logits
 
-    def empty_input_forward(self):
+    def empty_input_forward(self, forward_meta):
         """
         empty_input_forward
         """
@@ -430,7 +430,7 @@ def empty_input_forward(self):
             self.fd_config.model_config.moe_layer_start_index,
             self.fd_config.model_config.num_hidden_layers,
         ):
-            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate)
+            self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate, forward_meta)
 
     def forward(
         self,
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 3b40c8c164f..be9f90187bd 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -904,7 +904,7 @@ def _propose(self, step_use_cudagraph: bool = False):
                     self._get_self_hidden_states(hidden_states)
             else:
                 if hasattr(self.model, "empty_input_forward"):
-                    self.model.empty_input_forward()
+                    self.model.empty_input_forward(self.forward_meta)
 
     def _get_self_hidden_states(self, hidden_states):
         target_hidden_states = eagle_get_self_hidden_states(
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 3444cc7dd1f..6bd8da02b24 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -971,7 +971,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop():
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 1. Prepare inputs of model and sampler.
@@ -1088,14 +1088,14 @@ class at the server level, which is too granular for ModelRunner.
         self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
         In certain scenarios, such as during EP,
         the runner needs to execute partial modules of the model without input data.
         This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index e547da97df7..404367cc803 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -2087,7 +2087,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop():
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 2. Padding inputs for cuda graph
@@ -2349,14 +2349,14 @@ def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Opti
 
         return pooler_output
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
         In certain scenarios, such as during EP,
         the runner needs to execute partial modules of the model without input data.
         This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py
index b86d680f08f..b7363dfdbf3 100644
--- a/fastdeploy/worker/hpu_model_runner.py
+++ b/fastdeploy/worker/hpu_model_runner.py
@@ -1345,14 +1345,14 @@ class at the server level, which is too granular for ModelRunner.
             self.prof.step()
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
         In certain scenarios, such as during EP,
         the runner needs to execute partial modules of the model without input data.
         This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
index b346f7be6ab..3038a34fc2b 100644
--- a/fastdeploy/worker/metax_model_runner.py
+++ b/fastdeploy/worker/metax_model_runner.py
@@ -1812,7 +1812,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop():
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 2. Padding inputs for cuda graph
@@ -1998,14 +1998,14 @@ class at the server level, which is too granular for ModelRunner.
         )
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
         In certain scenarios, such as during EP,
         the runner needs to execute partial modules of the model without input data.
         This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 4ab4ee2ff3c..d337225b178 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -1159,7 +1159,7 @@ class at the server level, which is too granular for ModelRunner.
         # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
         # when there is data on other runner, the current runner is required to execute part of the model.
         if not self.not_need_stop() and not is_dummy_run:
-            self._execute_empty_input()
+            self._execute_empty_input(self.forward_meta)
             return None
 
         # 2. Padding inputs for cuda grph
@@ -1231,14 +1231,14 @@ class at the server level, which is too granular for ModelRunner.
 
         return None
 
-    def _execute_empty_input(self) -> None:
+    def _execute_empty_input(self, forward_meta) -> None:
         """
         In certain scenarios, such as during EP,
         the runner needs to execute partial modules of the model without input data.
         This requires the model to implement the `empty_input_forward` method.
         """
         if hasattr(self.model, "empty_input_forward"):
-            self.model.empty_input_forward()
+            self.model.empty_input_forward(forward_meta)
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")