From a1cd3d432cbb93fe021e780e13333ea8532215ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?= Date: Thu, 24 Jul 2025 12:19:40 +0000 Subject: [PATCH] fix scheduler free prefill token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sophie du Couédic --- vllm_spyre/v1/core/scheduler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py index d1b6ef63f..5bc145770 100644 --- a/vllm_spyre/v1/core/scheduler.py +++ b/vllm_spyre/v1/core/scheduler.py @@ -224,8 +224,9 @@ def can_schedule(self, request) -> bool: cond2 = len(self.waiting) < max_prompt_batch_size # check that the prompt length does not exceed the current tkv cond3 = request.num_prompt_tokens <= self.tkv - # check that the number of requested tokens can be served - cond4 = request.max_tokens <= (max_context_len - self.tkv) + # check that the number of requested tokens can be served (-1 for free + # prefill token) + cond4 = request.max_tokens - 1 <= (max_context_len - self.tkv) # check that there are enough free blocks/pages remaining # Note: we only have to do check in case of a running batches # (not start_new_batch), because the minimal number of blocks covers