Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion tests/v1/spec_decode/test_eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,13 +337,19 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
"target_attn_1": mock.MagicMock(),
"target_attn_2": mock.MagicMock()
}
target_indx_layers: dict[str, mock.MagicMock] = {}
# Draft model has one extra attention layer compared to target model
all_attn_layers = {
**target_attn_layers, "draft_extra_attn": mock.MagicMock()
}

all_indx_layers: dict[str, mock.MagicMock] = {}

# Make mock_get_layers return different values for each call
mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
mock_get_layers.side_effect = [
target_attn_layers, target_indx_layers, all_attn_layers,
all_indx_layers
]

# Setup mock for pp group to return the appropriate value for world size
mock_pp_group = mock.MagicMock()
Expand Down Expand Up @@ -658,6 +664,9 @@ def create_deterministic_logits(token_ids, k: int):
# Mock runner for attention metadata building.
proposer.runner = mock.MagicMock()
proposer.runner.attn_groups.append([mock.MagicMock()])
proposer.runner.attn_groups[0][0].metadata_builders = [
attn_metadata_builder
]
proposer.runner.attn_groups[0][0].get_metadata_builder.return_value = \
attn_metadata_builder
proposer._get_attention_metadata_builder = mock.MagicMock(
Expand Down
8 changes: 7 additions & 1 deletion tests/v1/spec_decode/test_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,13 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers,

target_attn_layers = {"target_attn_1": mock.MagicMock()}
all_attn_layers = {**target_attn_layers, "draft_attn_1": mock.MagicMock()}
mock_get_layers.side_effect = [target_attn_layers, all_attn_layers]
target_indexer_layers: dict = {}
all_indexer_layers: dict = {}

mock_get_layers.side_effect = [
target_attn_layers, target_indexer_layers, all_attn_layers,
all_indexer_layers
]

mock_pp_group = mock.MagicMock()
mock_pp_group.world_size = 1
Expand Down
8 changes: 7 additions & 1 deletion vllm/config/speculative.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
@dataclass
class SpeculativeConfig:
"""Configuration for speculative decoding."""

enforce_eager: Optional[bool] = None
"""Override the default enforce_eager from model_config"""
# General speculative decoding control
num_speculative_tokens: SkipValidation[int] = None # type: ignore
"""The number of speculative tokens, if provided. It will default to the
Expand Down Expand Up @@ -219,6 +220,11 @@ def __post_init__(self):
assert (
self.target_model_config
is not None), "target_model_config must be present for mtp"
if self.target_model_config.hf_text_config.model_type \
== "deepseek_v32":
                # FIXME(luccafong): cudagraph with v32 MTP is not supported,
# remove this when the issue is fixed.
self.enforce_eager = True
# use the draft model from the same model:
self.model = self.target_model_config.model
# Align the quantization of draft model for cases such as
Expand Down
2 changes: 1 addition & 1 deletion vllm/v1/attention/backends/mla/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def get_max_prefill_buffer_size(vllm_config: VllmConfig):

class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
cudagraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE

reorder_batch_threshold: int = 1

Expand Down
8 changes: 7 additions & 1 deletion vllm/v1/spec_decode/eagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
):
self.vllm_config = vllm_config
self.speculative_config = vllm_config.speculative_config
assert self.speculative_config is not None
self.draft_model_config = self.speculative_config.draft_model_config
self.method = self.speculative_config.method

Expand All @@ -74,11 +75,16 @@ def __init__(
vllm_config.model_config)

self.attn_metadata_builder: Optional[AttentionMetadataBuilder] = None
self.draft_indexer_metadata_builder: Optional[
AttentionMetadataBuilder] = None
self.attn_layer_names: list[str] = []
self.indexer_layer_names: list[str] = []

self.use_cuda_graph = (not current_platform.is_xpu()
and self.vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE and
not self.vllm_config.model_config.enforce_eager)
not self.vllm_config.model_config.enforce_eager
and not self.speculative_config.enforce_eager)
self.cudagraph_batch_sizes = list(
reversed(self.vllm_config.compilation_config.
cudagraph_capture_sizes)) if self.use_cuda_graph else []
Expand Down