11 changes: 11 additions & 0 deletions vllm_spyre/v1/core/scheduler.py
@@ -157,6 +157,11 @@ def __init__(self, *args, **kwargs) -> None:
# cache for self.check_batch_tkv_limit() outer key: tuple(request_ids),
# inner key: (request_id, max_batch_tkv_limit), value: (lower, upper)
self._cache_check_batch_tkv_limit: dict[tuple, dict[tuple, tuple]] = {}
# Consecutive prefill operations are interleaved with a decode step to
# minimize interruptions to currently running requests. This mitigates
# peaks in inter-token latency (ITL). A prefill is skipped if the
# previous step was also a prefill.
self.previous_step_was_prefill: bool = False

def update_from_output(
self,
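
The comment added in this hunk states the intent; as a rough illustration (hypothetical step traces, not output from the scheduler), interleaving turns a burst of back-to-back prefills into an alternating sequence, so running requests are never stalled for more than one prefill step:

# Hypothetical step traces for three newly arrived requests (illustration only).
without_interleaving = ["prefill", "prefill", "prefill", "decode", "decode"]
with_interleaving    = ["prefill", "decode", "prefill", "decode", "prefill"]
# Without interleaving, running requests stall for three consecutive prefill
# steps, producing a spike in inter-token latency (ITL); with interleaving,
# the longest stall a running request sees is a single prefill step.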
@@ -199,12 +204,14 @@ def schedule(self) -> "SchedulerOutput":
# Schedule Prefill and Decode separately
if len(self.waiting) > 0:
# For prefill, hide current decodes from the scheduler
self.previous_step_was_prefill = True
running_holdback = self.running
self.running = []
logger.debug(
"Scheduling a prefill step of %d requests, holding back %d "
"requests", len(self.waiting), len(holdback_queue))
else:
self.previous_step_was_prefill = False
running_holdback = []
logger.debug("Scheduling a decode step of %d requests",
len(self.running))
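
A minimal sketch of the hide-and-restore pattern in this hunk: the running requests are stashed in running_holdback and self.running is emptied so the parent scheduler sees only the waiting (prefill) work. The restore step below is an assumption for illustration; it is not visible in this excerpt.

# Sketch only; the merge back into self.running is assumed, not shown above.
running_holdback = self.running
self.running = []                      # hide current decodes from the scheduler
outputs = super().schedule()           # parent schedules only the prefill work
self.running = running_holdback + self.running   # restore the hidden decodes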
@@ -223,6 +230,10 @@ def can_schedule(self, request) -> bool:
max_prompt_batch_size = 1
max_context_len = self.scheduler_config.max_model_len

# two consecutive prefill steps are not allowed
Collaborator: nit: I guess you mean "not allowed", not "now allowed" in the comment

if self.previous_step_was_prefill:
return False

# running and waiting queues are both empty -> start a new batch
# which can always be scheduled
if len(self.running) + len(self.waiting) == 0:
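
Putting the two pieces together, here is a self-contained toy (not the vLLM Spyre scheduler; the names and queue types are simplified for illustration) showing how the flag set in schedule() and the early return in can_schedule() combine so that no two prefill steps run back to back:

from collections import deque

class ToyScheduler:
    """Toy model of the prefill/decode interleaving policy."""

    def __init__(self) -> None:
        self.waiting: deque[str] = deque()   # requests awaiting prefill
        self.running: list[str] = []         # requests being decoded
        self.previous_step_was_prefill = False

    def step(self) -> str:
        # Mirrors can_schedule(): refuse a prefill right after a prefill.
        if self.waiting and not self.previous_step_was_prefill:
            self.running.append(self.waiting.popleft())
            self.previous_step_was_prefill = True
            return "prefill"
        self.previous_step_was_prefill = False
        return "decode"

sched = ToyScheduler()
sched.waiting.extend(["req-0", "req-1", "req-2"])
print([sched.step() for _ in range(6)])
# -> ['prefill', 'decode', 'prefill', 'decode', 'prefill', 'decode']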