diff --git a/atom/model_engine/block_manager.py b/atom/model_engine/block_manager.py index 4ea5c1494..d6f429865 100644 --- a/atom/model_engine/block_manager.py +++ b/atom/model_engine/block_manager.py @@ -146,7 +146,12 @@ def can_allocate(self, seq: Sequence) -> int: # blocks. See `allocate()` for the budget reasoning. if seq.has_per_req_cache and not self.free_per_req_cache_groups: return -1 - if not self.enable_prefix_caching: + # Multimodal sequences bypass prefix caching: image-placeholder tokens + # all share a single id, so token-id hashing collides across different + # images and would reuse the wrong image's KV (vision-embeds vs + # placeholder count mismatch -> runner crash). ATOM has no mm-aware + # cache hashing yet, so simply don't prefix-cache multimodal seqs. + if not self.enable_prefix_caching or seq.multimodal_data is not None: if len(self.free_block_ids_set) < seq.num_blocks: return -1 return 0 @@ -224,7 +229,7 @@ def hash_blocks(self, seq: Sequence, num_new_tokens: int) -> None: single-shot prefill that's `seq.num_tokens - seq.num_cached_tokens`; chunked prefill will pass the per-chunk count. """ - if not self.enable_prefix_caching: + if not self.enable_prefix_caching or seq.multimodal_data is not None: return start = seq.num_cached_tokens // self.block_size end = (seq.num_cached_tokens + num_new_tokens) // self.block_size