diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index ede87972185..36a98e54dbe 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -21,6 +21,7 @@ from paddleformers.utils.log import logger from fastdeploy import envs +from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.utils import h2d_copy, slice_fn from fastdeploy.platforms import current_platform @@ -593,7 +594,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer): out = multi_outs[:token_num, :] return out - def forward(self, x: paddle.Tensor, gate: nn.Layer): + def forward(self, x: paddle.Tensor, gate: nn.Layer, forward_meta: ForwardMeta): """ Defines the forward computation of the moe layer. diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index 04fa0abd09b..5fab9808dbe 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -104,7 +104,7 @@ def load_state_dict(self, state_dict): self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) - def forward(self, x): + def forward(self, x, forward_meta): """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) @@ -187,10 +187,10 @@ def load_state_dict(self, state_dict): self.experts.load_state_dict(state_dict) self.shared_experts.load_state_dict(state_dict) - def forward(self, hidden_states: paddle.Tensor): + def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): """ """ - shared_experts_out = self.shared_experts(hidden_states) - moe_out = self.experts(hidden_states, self.gate) + shared_experts_out = self.shared_experts(hidden_states, forward_meta) + moe_out = self.experts(hidden_states, self.gate, forward_meta) moe_out = moe_out + 
shared_experts_out # We do to TP all reduce after the sum of experts. if self.tp_size > 1: @@ -514,7 +514,7 @@ def forward( hidden_states = self.self_attn(forward_meta, hidden_states, position_ids, mask_encoder_batch) hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 75947590be8..7d26764cb38 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -95,7 +95,7 @@ def load_state_dict(self, state_dict): self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) - def forward(self, hidden_states: paddle.Tensor): + def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): gate_up_out = self.up_gate_proj(hidden_states) act_out = self.act_fn(gate_up_out) down_out = self.down_proj(act_out) @@ -213,10 +213,18 @@ def load_state_dict(self, state_dict): def update_state_dict(self, state_dict): self.fused_moe.load_state_dict(state_dict, True) - def forward(self, hidden_states: paddle.Tensor): - out = self.experts(hidden_states, self.gate) + def forward( + self, + hidden_states: paddle.Tensor, + forward_meta: ForwardMeta, + ): + out = self.experts( + x=hidden_states, + gate=self.gate, + forward_meta=forward_meta, + ) if self.num_shared_experts > 0: - s_x = self.shared_experts(hidden_states) + s_x = self.shared_experts(hidden_states, forward_meta) out = out + s_x return out @@ -344,7 +352,10 @@ def forward( residual, ) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp( + hidden_states=hidden_states, + forward_meta=forward_meta, + ) return hidden_states, residual @@ -609,7 +620,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def 
empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -621,7 +632,7 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate) + self.ernie.layers[i].mlp.experts(fake_hidden_states, self.ernie.layers[i].mlp.gate, forward_meta) def forward( self, diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 0aedb040062..2d57ed504cb 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -436,7 +436,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -448,7 +448,7 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.fused_moe(fake_hidden_states) + self.ernie.layers[i].mlp.fused_moe(hidden_states=fake_hidden_states, forward_meta=forward_meta) def forward( self, diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index a291db0e9a5..7c3685f9b22 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -169,8 +169,8 @@ def __init__( model_format="", ) - def forward(self, hidden_states: paddle.Tensor): - out = self.experts(hidden_states, self.gate) + def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): + out = self.experts(hidden_states, self.gate, forward_meta) return out def load_state_dict(self, state_dict): @@ -269,9 +269,9 @@ def load_state_dict(self, state_dict): if self.num_shared_experts > 0: 
self.shared_experts.load_state_dict(state_dict) - def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta): + def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta, vl_moe_meta: VLMoEMeta): if self.num_shared_experts > 0: - shared_experts_out = self.shared_experts(hidden_states) + shared_experts_out = self.shared_experts(hidden_states, forward_meta) hidden_states, text_input, image_input = text_image_gather_scatter( hidden_states, vl_moe_meta.text_input, @@ -281,8 +281,8 @@ def forward(self, hidden_states: paddle.Tensor, vl_moe_meta: VLMoEMeta): vl_moe_meta.image_index, True, ) - text_out = self.text_fused_moe(text_input) - image_out = self.image_fused_moe(image_input) + text_out = self.text_fused_moe(text_input, forward_meta) + image_out = self.image_fused_moe(image_input, forward_meta) hidden_states, _, _ = text_image_gather_scatter( hidden_states, text_out, @@ -388,9 +388,9 @@ def forward( hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) if isinstance(self.mlp, Ernie4_5_VLMoE): - hidden_states = self.mlp(hidden_states, vl_moe_meta) + hidden_states = self.mlp(hidden_states, forward_meta, vl_moe_meta) else: - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual @@ -745,7 +745,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ empty_input_forward """ @@ -757,8 +757,8 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states) - self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states) + self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states, forward_meta) + self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states, forward_meta) def get_input_embeddings( self, diff --git 
a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index 8850ce81243..c18762d49d4 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -85,7 +85,7 @@ def __init__( act_method=fd_config.model_config.hidden_act, ) - def forward(self, x): + def forward(self, x, forward_meta): """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) @@ -161,9 +161,9 @@ def __init__( reduce_results=False, ) - def forward(self, x): - shared_experts_out = self.shared_experts(x) - out = self.experts(x, self.gate) + def forward(self, x, forward_meta): + shared_experts_out = self.shared_experts(x, forward_meta) + out = self.experts(x, self.gate, forward_meta) out = out + shared_experts_out # We do to TP all reduce after the sum of experts. if self.tensor_parallel_size > 1: @@ -306,7 +306,10 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp( + hidden_states, + forward_meta, + ) return hidden_states, residual diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py index e951fff92f5..682c9f5f1ec 100644 --- a/fastdeploy/model_executor/models/gpt_oss.py +++ b/fastdeploy/model_executor/models/gpt_oss.py @@ -124,8 +124,8 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = ""): model_format="", ) - def forward(self, hidden_states: paddle.Tensor): - expert_output = self.experts(hidden_states, self.router) + def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): + expert_output = self.experts(hidden_states, self.router, forward_meta) return expert_output @@ -173,7 +173,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual diff --git 
a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index d49f3a32705..e56423c9136 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -90,7 +90,7 @@ def load_state_dict(self, state_dict): self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) - def forward(self, x): + def forward(self, x, forward_meta): """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) @@ -206,7 +206,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 9537b84f22c..5f67ba75b99 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -79,8 +79,8 @@ def __init__( weight_dtype="float32", ) - def forward(self, x): - return self.experts(x, self.gate) + def forward(self, x, forward_meta): + return self.experts(x, self.gate, forward_meta) def load_state_dict(self, state_dict): """ """ @@ -127,7 +127,7 @@ def load_state_dict(self, state_dict): self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) - def forward(self, x): + def forward(self, x, forward_meta): """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) @@ -206,7 +206,7 @@ def forward( # Fully Connected hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) - hidden_states = self.mlp(hidden_states) + hidden_states = self.mlp(hidden_states, forward_meta) return hidden_states, residual @@ -418,7 +418,7 @@ def compute_logits(self, hidden_states: paddle.Tensor): return logits - def empty_input_forward(self): + def empty_input_forward(self, forward_meta): """ 
empty_input_forward """ @@ -430,7 +430,7 @@ def empty_input_forward(self): self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers, ): - self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate) + self.model.layers[i].mlp.experts(fake_hidden_states, self.model.layers[i].mlp.gate, forward_meta) def forward( self, diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 3b40c8c164f..be9f90187bd 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -904,7 +904,7 @@ def _propose(self, step_use_cudagraph: bool = False): self._get_self_hidden_states(hidden_states) else: if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(self.forward_meta) def _get_self_hidden_states(self, hidden_states): target_hidden_states = eagle_get_self_hidden_states( diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 3444cc7dd1f..6bd8da02b24 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -971,7 +971,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop(): - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 1. Prepare inputs of model and sampler. @@ -1088,14 +1088,14 @@ class at the server level, which is too granular for ModelRunner. self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False) return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. 
This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index e547da97df7..404367cc803 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2087,7 +2087,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop(): - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 2. Padding inputs for cuda graph @@ -2349,14 +2349,14 @@ def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Opti return pooler_output - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py index b86d680f08f..b7363dfdbf3 100644 --- a/fastdeploy/worker/hpu_model_runner.py +++ b/fastdeploy/worker/hpu_model_runner.py @@ -1345,14 +1345,14 @@ class at the server level, which is too granular for ModelRunner. 
self.prof.step() return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index b346f7be6ab..3038a34fc2b 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -1812,7 +1812,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop(): - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 2. Padding inputs for cuda graph @@ -1998,14 +1998,14 @@ class at the server level, which is too granular for ModelRunner. ) return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. 
""" if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 4ab4ee2ff3c..d337225b178 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -1159,7 +1159,7 @@ class at the server level, which is too granular for ModelRunner. # This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode, # when there is data on other runner, the current runner is required to execute part of the model. if not self.not_need_stop() and not is_dummy_run: - self._execute_empty_input() + self._execute_empty_input(self.forward_meta) return None # 2. Padding inputs for cuda grph @@ -1231,14 +1231,14 @@ class at the server level, which is too granular for ModelRunner. return None - def _execute_empty_input(self) -> None: + def _execute_empty_input(self, forward_meta) -> None: """ In certain scenarios, such as during EP, the runner needs to execute partial modules of the model without input data. This requires the model to implement the `empty_input_forward` method. """ if hasattr(self.model, "empty_input_forward"): - self.model.empty_input_forward() + self.model.empty_input_forward(forward_meta) else: raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")