From a1cd3d432cbb93fe021e780e13333ea8532215ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?= Date: Thu, 24 Jul 2025 12:19:40 +0000 Subject: [PATCH] fix scheduler free prefill token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sophie du Couédic --- vllm_spyre/v1/core/scheduler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py index d1b6ef63f..5bc145770 100644 --- a/vllm_spyre/v1/core/scheduler.py +++ b/vllm_spyre/v1/core/scheduler.py @@ -224,8 +224,9 @@ def can_schedule(self, request) -> bool: cond2 = len(self.waiting) < max_prompt_batch_size # check that the prompt length does not exceed the current tkv cond3 = request.num_prompt_tokens <= self.tkv - # check that the number of requested tokens can be served - cond4 = request.max_tokens <= (max_context_len - self.tkv) + # check that the number of requested tokens can be served (-1 for free + # prefill token) + cond4 = request.max_tokens - 1 <= (max_context_len - self.tkv) # check that there are enough free blocks/pages remaining # Note: we only have to do check in case of a running batches # (not start_new_batch), because the minimal number of blocks covers