Changes from all commits
89 commits
61fece9
feat(phase7): LLaDA2.0 Real Model with MoE + Block Attention + GPU Tests
AlonKellner-RedHat May 5, 2026
5d1bf7e
fix: add trust_remote_code=True for LLaDA2.0 model loading
AlonKellner-RedHat May 5, 2026
26d7e94
fix: register HuggingFace architecture name LLaDA2MoeModelLM
AlonKellner-RedHat May 5, 2026
bfa7174
fix: correct AttentionMetadata import path for vLLM compatibility
AlonKellner-RedHat May 5, 2026
9549ab5
fix: add supported_runners to LLaDA2ForCausalLM
AlonKellner-RedHat May 5, 2026
be20bda
fix: remove trust_remote_code from vLLM model loading
AlonKellner-RedHat May 5, 2026
511dad2
fix: add trust_remote_code=True for LLaDA2.0 config loading
AlonKellner-RedHat May 5, 2026
12383c7
fix: set VLLM_PLUGINS env var before importing vllm
AlonKellner-RedHat May 5, 2026
a9b69a4
fix: disable trust_remote_code to use ModelRegistry model
AlonKellner-RedHat May 5, 2026
9531737
feat: add local LLaDA2.0-mini fixture without auto_map
AlonKellner-RedHat May 5, 2026
3ea5db7
fix: use model_type='mistral' for Transformers compatibility
AlonKellner-RedHat May 5, 2026
a3ec6ef
fix: explicitly call register_dllm() in test module
AlonKellner-RedHat May 5, 2026
3088dac
debug: add print statements to trace register_dllm() execution
AlonKellner-RedHat May 5, 2026
3312ec3
fix: remove model_type from config to force architectures lookup
AlonKellner-RedHat May 5, 2026
675f986
fix: add model_type='llama' for config loading, rely on architectures…
AlonKellner-RedHat May 5, 2026
a7d9514
workaround: monkeypatch ModelConfig validation for LLaDA2 architectures
AlonKellner-RedHat May 5, 2026
8c683d9
fix: monkeypatch signature and logic for ModelConfig validation bypass
AlonKellner-RedHat May 5, 2026
b1abcc2
debug: add diagnostics to monkeypatch to see why it's not executing
AlonKellner-RedHat May 5, 2026
9f3b643
fix: check model path instead of architectures field in monkeypatch
AlonKellner-RedHat May 5, 2026
431f144
fix: call __post_init__ but catch and suppress validation error
AlonKellner-RedHat May 5, 2026
17fa358
try: use model_impl parameter to bypass validation and force plugin m…
AlonKellner-RedHat May 5, 2026
ea20315
fix: patch _verify_runner_supported instead of __post_init__ to compl…
AlonKellner-RedHat May 5, 2026
48653a5
try: remove monkeypatch, rely only on model_impl parameter
AlonKellner-RedHat May 5, 2026
943a356
try: use trust_remote_code=True with model_impl parameter
AlonKellner-RedHat May 5, 2026
b650d4b
fix: add embed_input_ids method to satisfy VllmModel protocol
AlonKellner-RedHat May 5, 2026
e270bbf
fix: accept LLADA2_HF_ARCHITECTURE_NAME in stack validation
AlonKellner-RedHat May 5, 2026
031719d
fix: update attention layer API for vLLM 0.20.0
AlonKellner-RedHat May 5, 2026
c07050d
fix: download real LLaDA2.0-mini weights from HuggingFace
AlonKellner-RedHat May 5, 2026
630bdcd
feat: add Phase 7 A100 GPU test Helm values
AlonKellner-RedHat May 5, 2026
0fec484
fix: add A100-40 node toleration for Phase 7 GPU test
AlonKellner-RedHat May 5, 2026
2771ecb
fix: implement weight tying for lm_head in LLaDA2.0 model
AlonKellner-RedHat May 5, 2026
a79e5d5
fix: implement proper weight tying for LLaDA2.0 lm_head
AlonKellner-RedHat May 5, 2026
a838a4c
fix: handle HuggingFace checkpoint naming in LLaDA2.0 weight loading
AlonKellner-RedHat May 5, 2026
6bd1f23
fix: correct loaded_params tracking in expert weight collection
AlonKellner-RedHat May 5, 2026
5162b4d
debug: add detailed weight loading diagnostics
AlonKellner-RedHat May 5, 2026
01471f0
fix: implement expert weight stacking for FusedMoE
AlonKellner-RedHat May 5, 2026
5e0cf41
fix: add num_experts attribute to LLaDA2ForCausalLM
AlonKellner-RedHat May 5, 2026
32339da
fix: add weight name mappings for attention and shared expert
AlonKellner-RedHat May 5, 2026
391fc34
fix: handle plural shared_experts in checkpoint naming
AlonKellner-RedHat May 5, 2026
472f946
feat: add QKV projection and output layers to attention
AlonKellner-RedHat May 5, 2026
11ff410
fix: map mlp.gate.expert_bias to mlp.gate.bias
AlonKellner-RedHat May 5, 2026
432f414
fix: load expert weights into existing Parameter.data
AlonKellner-RedHat May 5, 2026
b621a5c
debug: add logging for expert weight stacking
AlonKellner-RedHat May 5, 2026
7136786
debug: check if FusedMoE creates w13/w2 Parameters
AlonKellner-RedHat May 5, 2026
4e21a51
fix: use weight_loader for stacked expert weights
AlonKellner-RedHat May 5, 2026
83fe8bd
fix: use .data.copy_() for stacked expert weights
AlonKellner-RedHat May 5, 2026
6b9b372
fix: return loaded_params instead of unloaded in load_weights
AlonKellner-RedHat May 5, 2026
a1bc587
debug: add layer 0 MLP checkpoint names output
AlonKellner-RedHat May 5, 2026
1cfa29c
feat: support dense-only layers (first_k_dense_replace)
AlonKellner-RedHat May 5, 2026
aeb19c0
fix: make forward() args optional for profiling
AlonKellner-RedHat May 5, 2026
c6cc893
fix: make compute_logits sampling_metadata optional
AlonKellner-RedHat May 5, 2026
252cfb7
fix: pass lm_head module instead of weight to LogitsProcessor
AlonKellner-RedHat May 5, 2026
de55019
fix: pass positions to attention and create default if None
AlonKellner-RedHat May 5, 2026
fd9aab6
debug: add shape and NaN/Inf validation in forward pass
AlonKellner-RedHat May 5, 2026
9b36f46
debug: add detailed logging in compute_logits
AlonKellner-RedHat May 5, 2026
58db841
debug: enable CUDA_LAUNCH_BLOCKING for synchronous error reporting
AlonKellner-RedHat May 5, 2026
1417a3d
debug: add expert weight shape validation for Triton MoE kernel
AlonKellner-RedHat May 5, 2026
f5be9a7
debug: add logits shape and vocab_size logging
AlonKellner-RedHat May 5, 2026
30ea409
debug: add extensive logging before combine_sampled_and_draft_tokens
AlonKellner-RedHat May 5, 2026
2066014
debug: add detailed draft tokens logging
AlonKellner-RedHat May 5, 2026
573a7da
fix: populate req_states.draft_tokens from scheduler output
AlonKellner-RedHat May 5, 2026
7c1545c
fix: add LLaDA2MoeModelLM to dllm_architecture_match
AlonKellner-RedHat May 5, 2026
4e36a68
fix: extract only draft token logits for dLLM remasking
AlonKellner-RedHat May 5, 2026
6392de4
fix: add num_sampled argument to SamplerOutput
AlonKellner-RedHat May 5, 2026
c7ca86f
chore: remove debug logging from Phase 7 implementation
AlonKellner-RedHat May 5, 2026
54f8176
feat: add LLaDA2.0 benchmark tests with TPS, TTFT, ITL, E2E metrics
AlonKellner-RedHat May 6, 2026
e9144dc
feat: add GuideLLM benchmark test for LLaDA2.0-mini
AlonKellner-RedHat May 6, 2026
5b8f13e
fix: benchmark test trust_remote_code and async-scheduling args
AlonKellner-RedHat May 6, 2026
3efb0c9
fix: correct GuideLLM CLI command structure
AlonKellner-RedHat May 6, 2026
9042a9e
fix: use correct GuideLLM synthetic data format
AlonKellner-RedHat May 6, 2026
9956fe7
fix: add processor argument for GuideLLM synthetic data
AlonKellner-RedHat May 6, 2026
20ffcb6
fix(scheduler): skip spec decode metrics to prevent assertion failures
AlonKellner-RedHat May 6, 2026
102c248
feat(phase8): add GPU capability detection and torch.compile optimiza…
AlonKellner-RedHat May 6, 2026
76e4e1f
feat(phase8): use vLLM-native torch.compile via @support_torch_compile
AlonKellner-RedHat May 6, 2026
8c92dc5
chore: clean up root directory and document Phase 8 benchmarks
AlonKellner-RedHat May 6, 2026
74d110b
fix: address code review feedback (P0-P1 issues)
AlonKellner-RedHat May 6, 2026
182ba3c
feat(phase7): implement virtual batch pattern for block-style attenti…
AlonKellner-RedHat May 6, 2026
a621f51
fix: resolve linting issues in virtual_batches.py
AlonKellner-RedHat May 7, 2026
302de1c
feat(phase7): wire num_prefix_tokens through call stack and activate …
AlonKellner-RedHat May 7, 2026
836a25c
feat(phase7): extract num_prefix_tokens from scheduler to model runner
AlonKellner-RedHat May 7, 2026
92ee989
feat(phase7): override _model_forward to inject num_prefix_tokens
AlonKellner-RedHat May 7, 2026
44c06c2
feat(scripts): add LLaDA2 server and benchmark scripts
AlonKellner-RedHat May 7, 2026
a22975e
fix(phase7): address PR #38 code review feedback (P0/P1 issues)
AlonKellner-RedHat May 7, 2026
06f36da
feat(phase7): query KV cache block size from cache_config
AlonKellner-RedHat May 7, 2026
aefaeec
fix(typing): resolve all 27 typing diagnostics in dllm-plugin
AlonKellner-RedHat May 7, 2026
c801f68
feat: vLLM type safety improvements for Phase 7
AlonKellner-RedHat May 7, 2026
dcf7fe1
feat: add vLLM server pod manifest for testing
AlonKellner-RedHat May 7, 2026
315a3b2
fix(phase7): address P0 blocking issues from PR #38 review
AlonKellner-RedHat May 7, 2026
76f6cd7
docs(phase8): add P0-2 A/B benchmark results - torch.compile neutral
AlonKellner-RedHat May 7, 2026
9 changes: 9 additions & 0 deletions .gitignore
@@ -25,3 +25,12 @@ htmlcov/
.vscode/
*.swp
.DS_Store

# Benchmarks (results should not be tracked)
benchmarks/
benchmarks.csv
benchmarks.json

# Temporary values files
*-values.yaml
values.yaml
91 changes: 76 additions & 15 deletions dllm_plugin/__init__.py
@@ -51,13 +51,18 @@ def __getattr__(name: str):
def register_dllm() -> None:
"""Entry point for ``vllm.general_plugins`` (``dllm``).

When ``vllm`` is importable, registers **two** architecture names with
``ModelRegistry``, both pointing at the same **mock** implementation for
Phases 2–6 stack testing (issues #5 and #24):
**Phase 7 update:** Registers architecture names with ``ModelRegistry``:

* :data:`~dllm_plugin.config.LLADA2_ARCHITECTURE_NAME` — placeholder
until the real HF-mapped module ships (issue #12 / Phase 7).
* :data:`~dllm_plugin.config.DLLM_MOCK_STACK_MODEL_ID` — explicit test id.
* :data:`~dllm_plugin.config.LLADA2_ARCHITECTURE_NAME` — Production LLaDA2.0
model with MoE and block-style attention (Phase 7 / issue #12). Use
``VLLM_DLLM_USE_MOCK_MODEL=1`` to override with mock for testing.
* :data:`~dllm_plugin.config.DLLM_MOCK_STACK_MODEL_ID` — Explicit mock model
for Phases 2–6 stack testing (always uses mock implementation).

Environment variables:
* ``VLLM_DLLM_USE_MOCK_MODEL``: If set to ``1``/``true``/``yes``/``on``,
registers LLADA2_ARCHITECTURE_NAME to the mock model instead of real model.
Useful for testing Phases 2-6 behavior with real model disabled.

Uses lazy ``"<module>:<Class>"`` registration so importing this package does
not pull ``torch``/CUDA until the model class is needed.
@@ -74,11 +74,16 @@ def register_dllm() -> None:
helper (no-op), not by omitting the call—so with both envs set, ``apply_*``
still runs and returns without patching.
"""
_logger.debug("dLLM plugin: register_dllm() called")

if importlib.util.find_spec("vllm") is None:
_logger.debug("dLLM plugin: vllm not found, skipping registration")
return

try:
from vllm import ModelRegistry

_logger.debug("dLLM plugin: ModelRegistry imported successfully")
except ImportError:
_logger.debug(
"vllm-dllm-plugin (dllm): vLLM spec found but import failed; "
Expand All @@ -91,22 +101,73 @@ def register_dllm() -> None:
DLLM_MOCK_MODEL_CLASS_FQCN,
DLLM_MOCK_STACK_MODEL_ID,
LLADA2_ARCHITECTURE_NAME,
LLADA2_HF_ARCHITECTURE_NAME,
LLADA2_REAL_MODEL_CLASS_FQCN,
)

# Determine which model to use for LLADA2_ARCHITECTURE_NAME
use_mock_raw = os.environ.get("VLLM_DLLM_USE_MOCK_MODEL", "").strip().lower()
use_mock_model = use_mock_raw in {"1", "true", "yes", "on"}

if use_mock_model:
llada2_model_class = DLLM_MOCK_MODEL_CLASS_FQCN
_logger.info(
"dLLM plugin: VLLM_DLLM_USE_MOCK_MODEL=1, using mock model for %s",
LLADA2_ARCHITECTURE_NAME,
)
else:
llada2_model_class = LLADA2_REAL_MODEL_CLASS_FQCN
_logger.info(
"dLLM plugin: Using real LLaDA2.0 model for %s (Phase 7)",
LLADA2_ARCHITECTURE_NAME,
)

supported = ModelRegistry.get_supported_archs()
for arch in (LLADA2_ARCHITECTURE_NAME, DLLM_MOCK_STACK_MODEL_ID):
if arch in supported:
_logger.debug(
"dLLM plugin: architecture %r already registered, skipping",
arch,
)
continue
ModelRegistry.register_model(arch, DLLM_MOCK_MODEL_CLASS_FQCN)

# Register LLADA2_ARCHITECTURE_NAME (real or mock based on env var)
if LLADA2_ARCHITECTURE_NAME not in supported:
ModelRegistry.register_model(LLADA2_ARCHITECTURE_NAME, llada2_model_class)
_logger.debug(
"dLLM plugin: registered architecture %r -> %s",
LLADA2_ARCHITECTURE_NAME,
llada2_model_class,
)
else:
_logger.debug(
"dLLM plugin: architecture %r already registered, skipping",
LLADA2_ARCHITECTURE_NAME,
)

# Register LLADA2_HF_ARCHITECTURE_NAME (HuggingFace naming convention)
# Points to same implementation as LLADA2_ARCHITECTURE_NAME
if LLADA2_HF_ARCHITECTURE_NAME not in supported:
ModelRegistry.register_model(LLADA2_HF_ARCHITECTURE_NAME, llada2_model_class)
_logger.debug(
"dLLM plugin: registered HF architecture %r -> %s",
LLADA2_HF_ARCHITECTURE_NAME,
llada2_model_class,
)
else:
_logger.debug(
"dLLM plugin: HF architecture %r already registered, skipping",
LLADA2_HF_ARCHITECTURE_NAME,
)

# Register DLLM_MOCK_STACK_MODEL_ID (always mock)
if DLLM_MOCK_STACK_MODEL_ID not in supported:
ModelRegistry.register_model(
DLLM_MOCK_STACK_MODEL_ID, DLLM_MOCK_MODEL_CLASS_FQCN
)
_logger.debug(
"dLLM plugin: registered architecture %r -> %s",
arch,
DLLM_MOCK_STACK_MODEL_ID,
DLLM_MOCK_MODEL_CLASS_FQCN,
)
else:
_logger.debug(
"dLLM plugin: architecture %r already registered, skipping",
DLLM_MOCK_STACK_MODEL_ID,
)

from dllm_plugin.config import DLLM_APPLY_ENGINE_CORE_DRAFT_HOOK_ENV_VAR

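Below is a minimal, hedged usage sketch of the new toggle (not part of the diff). It assumes vLLM is installed and that the flag is exported before `register_dllm()` runs, since the value is read at registration time.

```python
# Sketch: force the mock implementation for stack-style testing.
# Assumes vllm is importable; the flag is read when register_dllm() executes.
import os

os.environ["VLLM_DLLM_USE_MOCK_MODEL"] = "1"  # accepted values: 1/true/yes/on

from dllm_plugin import register_dllm

register_dllm()  # idempotent: already-registered architectures are skipped

from vllm import ModelRegistry

# Both "LLaDA2ForCausalLM" and "LLaDA2MoeModelLM" now resolve to the mock class;
# leave the variable unset (or empty) to get the real Phase 7 model instead.
print("LLaDA2ForCausalLM" in ModelRegistry.get_supported_archs())  # True
```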
Empty file.
135 changes: 135 additions & 0 deletions dllm_plugin/attention/virtual_batches.py
@@ -0,0 +1,135 @@
"""Virtual batch decomposition for block-style attention.

Following vLLM's chunked_local_attention pattern, this module transforms
CommonAttentionMetadata into virtual batches for the prefix and block attention chunks.

Reference: vllm/model_executor/layers/attention/chunked_local_attention.py
"""

from __future__ import annotations

import torch

# vLLM imports (centralized in vllm_compat for version handling)
from dllm_plugin.vllm_compat import CommonAttentionMetadata


def make_block_attention_virtual_batches(
attn_metadata: CommonAttentionMetadata,
num_prefix_tokens: int,
block_size: int,
kv_cache_block_size: int = 16,
) -> tuple[CommonAttentionMetadata | None, CommonAttentionMetadata]:
"""Transform metadata for block-style dual-chunk attention.

Creates two virtual batches per request:
1. Prefix chunk: Q=current_block (block_size tokens), KV=prefix (num_prefix_tokens)
2. Block chunk: Q=current_block (block_size tokens), KV=current_block (block_size)

Each virtual batch gets its own:
- seq_lens: Length of KV for that chunk
- block_table: KV cache pages accessible to that chunk
- query_start_loc: Position offsets in the query tensor

Args:
attn_metadata: Original CommonAttentionMetadata from vLLM
num_prefix_tokens: Number of committed tokens (prefix length)
block_size: Size of current generation block (typically 32)
kv_cache_block_size: KV cache block size (default 16, should be
queried from cache_config in future)

Returns:
(prefix_metadata, block_metadata): Transformed metadata for each chunk
prefix_metadata is None if num_prefix_tokens == 0

Raises:
NotImplementedError: If num_reqs > 1 (multi-request batching not yet supported)
"""
device = attn_metadata.query_start_loc.device
num_reqs = attn_metadata.num_reqs
total_query_tokens = attn_metadata.num_actual_tokens

# MVP limitation: Only single-request batches supported
# Multi-request batching with heterogeneous prefix lengths requires
# per-request virtual batch transformation (deferred to Phase 7.1)
if num_reqs > 1:
raise NotImplementedError(
"LLaDA2.0 virtual batch attention does not support multi-request "
"batching in this release (MVP Phase 7). Use max_num_seqs=1 or "
"wait for Phase 7.1 update. See docs/OPERATOR_LLaDA2.md for details."
)

# Edge case: First block (no prefix)
if num_prefix_tokens == 0:
# Only block self-attention, no prefix chunk
block_metadata = CommonAttentionMetadata(
query_start_loc=attn_metadata.query_start_loc,
query_start_loc_cpu=attn_metadata.query_start_loc_cpu,
seq_lens=torch.full(
(num_reqs,), block_size, dtype=torch.int32, device=device
),
num_reqs=num_reqs,
num_actual_tokens=total_query_tokens,
max_query_len=block_size,
max_seq_len=block_size,
block_table_tensor=attn_metadata.block_table_tensor,
slot_mapping=attn_metadata.slot_mapping,
causal=False, # Non-causal (bidirectional within block)
)
return None, block_metadata

# Calculate how many KV cache pages we need for prefix and block
# Assuming block_table has shape [num_reqs, max_num_blocks_per_seq]
# We need to slice it to get only the pages for prefix vs block

# Calculate blocks needed for prefix using configured KV cache block size
num_prefix_blocks = (
num_prefix_tokens + kv_cache_block_size - 1
) // kv_cache_block_size

# Slice block_table for each chunk
prefix_block_table = attn_metadata.block_table_tensor[:, :num_prefix_blocks]
block_start_idx = num_prefix_blocks
num_block_blocks = (block_size + kv_cache_block_size - 1) // kv_cache_block_size
block_end_idx = block_start_idx + num_block_blocks
block_block_table = attn_metadata.block_table_tensor[
:, block_start_idx:block_end_idx
]

# --- Virtual Batch 1: Prefix chunk ---
# Query: current block (block_size tokens)
# KV: prefix (num_prefix_tokens)

prefix_metadata = CommonAttentionMetadata(
query_start_loc=attn_metadata.query_start_loc,
query_start_loc_cpu=attn_metadata.query_start_loc_cpu,
seq_lens=torch.full(
(num_reqs,), num_prefix_tokens, dtype=torch.int32, device=device
),
num_reqs=num_reqs,
num_actual_tokens=total_query_tokens,
max_query_len=block_size,
max_seq_len=num_prefix_tokens,
block_table_tensor=prefix_block_table,
slot_mapping=attn_metadata.slot_mapping, # May need adjustment
causal=False, # Non-causal (all queries attend to all prefix keys)
)

# --- Virtual Batch 2: Block chunk ---
# Query: current block (block_size tokens)
# KV: current block (block_size tokens)

block_metadata = CommonAttentionMetadata(
query_start_loc=attn_metadata.query_start_loc,
query_start_loc_cpu=attn_metadata.query_start_loc_cpu,
seq_lens=torch.full((num_reqs,), block_size, dtype=torch.int32, device=device),
num_reqs=num_reqs,
num_actual_tokens=total_query_tokens,
max_query_len=block_size,
max_seq_len=block_size,
block_table_tensor=block_block_table,
slot_mapping=attn_metadata.slot_mapping, # May need adjustment
causal=False, # Non-causal (bidirectional within block)
)

return prefix_metadata, block_metadata
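For reference, a hedged single-request sketch of calling the helper above (not part of the diff; the `CommonAttentionMetadata` keyword arguments mirror the ones this file itself uses, and the tensor values are illustrative only):

```python
# Illustrative single-request example (shapes only; values are made up).
import torch

from dllm_plugin.attention.virtual_batches import make_block_attention_virtual_batches
from dllm_plugin.vllm_compat import CommonAttentionMetadata

block_size = 32            # current generation block
num_prefix_tokens = 64     # committed prefix tokens
kv_block = 16              # KV cache page size

qsl = torch.tensor([0, block_size], dtype=torch.int32)
meta = CommonAttentionMetadata(
    query_start_loc=qsl,
    query_start_loc_cpu=qsl,
    seq_lens=torch.tensor([num_prefix_tokens + block_size], dtype=torch.int32),
    num_reqs=1,
    num_actual_tokens=block_size,
    max_query_len=block_size,
    max_seq_len=num_prefix_tokens + block_size,
    # 4 pages hold the 64-token prefix, 2 more hold the 32-token block.
    block_table_tensor=torch.arange(6, dtype=torch.int32).unsqueeze(0),
    slot_mapping=torch.arange(num_prefix_tokens, num_prefix_tokens + block_size),
    causal=False,
)

prefix_meta, block_meta = make_block_attention_virtual_batches(
    meta,
    num_prefix_tokens=num_prefix_tokens,
    block_size=block_size,
    kv_cache_block_size=kv_block,
)
# prefix_meta: Q = current block, KV = 64-token prefix (pages 0..3)
# block_meta:  Q = current block, KV = current block    (pages 4..5)
```

With `num_prefix_tokens=0` the prefix chunk is skipped entirely and only the block metadata is returned, as in the first-block edge case above.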
35 changes: 35 additions & 0 deletions dllm_plugin/config.py
@@ -42,6 +42,10 @@ def _read_draft_size() -> int:
#: Exact registry string may be refined when ``register()`` lands (issue #5).
LLADA2_ARCHITECTURE_NAME: Final[str] = "LLaDA2ForCausalLM"

#: HuggingFace architecture name used by inclusionAI/LLaDA2.0-mini model config.
#: Registered alongside LLADA2_ARCHITECTURE_NAME to support both naming conventions.
LLADA2_HF_ARCHITECTURE_NAME: Final[str] = "LLaDA2MoeModelLM"

#: Registered model id for the **mock / stub** forward used in Phases 2-6 stack
#: testing (deterministic outputs; see milestone issue #24).
DLLM_MOCK_STACK_MODEL_ID: Final[str] = "DllmMockLlada2StackTest"
@@ -107,3 +107,34 @@ def resolve_strict_stack_validation(explicit: bool | None) -> bool:
#: (zeros + ``1.0`` at index ``0``, ``docs/MOCK_STACK_MODEL.md``) commit under
#: default settings for stack tests.
LLADA2_DEFAULT_COMMIT_CONFIDENCE_THRESHOLD: Final[float] = 0.01

# Phase 7: Real LLaDA2.0 model configuration (issue #12)

#: Lazy import target for real LLaDA2.0 vLLM model (``<module>:<Class>``).
#: Phase 7 adds production-ready model with MoE weight loading and
#: block-style attention.
LLADA2_REAL_MODEL_CLASS_FQCN: Final[str] = "dllm_plugin.models.llada2:LLaDA2ForCausalLM"

#: Default number of experts per MoE layer (from HuggingFace LLaDA2.0 config).
LLADA2_DEFAULT_NUM_EXPERTS: Final[int] = 256

#: Default number of experts activated per token (top-k routing).
LLADA2_DEFAULT_NUM_EXPERTS_PER_TOK: Final[int] = 8

#: Default number of shared experts (always active, not routed).
LLADA2_DEFAULT_NUM_SHARED_EXPERTS: Final[int] = 1

#: Default MoE intermediate size (FFN hidden dimension per expert).
LLADA2_DEFAULT_MOE_INTERMEDIATE_SIZE: Final[int] = 512

#: Default number of expert groups for group-limited routing.
#: LLaDA2.0 uses 8 groups for two-stage expert selection.
LLADA2_DEFAULT_N_GROUP: Final[int] = 8

#: Default number of groups to select in group-limited routing.
#: First selects top-4 groups from 8, then top-k experts from selected groups.
LLADA2_DEFAULT_TOPK_GROUP: Final[int] = 4

#: Default scaling factor applied to routed expert output.
#: LLaDA2.0 uses 2.5x scaling on routed experts before adding shared expert output.
LLADA2_DEFAULT_ROUTED_SCALING_FACTOR: Final[float] = 2.5
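Read together, the routing constants above describe a two-stage selection: score 8 groups, keep the top 4, then pick the top 8 experts within them. A hedged reference sketch follows (not part of the diff; the real model routes through vLLM's FusedMoE, and the exact scoring/normalization here is an assumption):

```python
# Reference sketch of two-stage, group-limited routing:
# 256 experts split into 8 groups -> keep the top-4 groups -> pick top-8 experts.
import torch


def group_limited_topk(
    router_logits: torch.Tensor,  # [num_tokens, 256]
    n_group: int = 8,             # LLADA2_DEFAULT_N_GROUP
    topk_group: int = 4,          # LLADA2_DEFAULT_TOPK_GROUP
    top_k: int = 8,               # LLADA2_DEFAULT_NUM_EXPERTS_PER_TOK
) -> tuple[torch.Tensor, torch.Tensor]:
    num_tokens, num_experts = router_logits.shape
    per_group = num_experts // n_group
    scores = router_logits.softmax(dim=-1)

    # Stage 1: score each group by its best expert and keep the top groups.
    group_scores = scores.view(num_tokens, n_group, per_group).amax(dim=-1)
    top_groups = group_scores.topk(topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)

    # Stage 2: pick the top-k experts, restricted to the selected groups.
    masked = (scores.view(num_tokens, n_group, per_group) * group_mask.unsqueeze(-1)).flatten(1)
    weights, expert_ids = masked.topk(top_k, dim=-1)
    return weights, expert_ids


weights, expert_ids = group_limited_topk(torch.randn(2, 256))
# Routed expert output is then scaled by 2.5 (LLADA2_DEFAULT_ROUTED_SCALING_FACTOR)
# before the shared expert's output is added.
```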