Commit cd8f8a8

[fix] Generation logits length for overlap scheduler early exit

Signed-off-by: Robin Kobus <[email protected]>

1 parent 25051c0 commit cd8f8a8

File tree

2 files changed: +37 -22 lines changed

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 35 additions & 22 deletions
@@ -43,19 +43,19 @@ class LogitsStorage:
 
     def __init__(
         self,
+        *,
         seq_length: int,
         use_device_memory=True,
-        should_exclude_last=False,
+        extra_token_for_overlap_scheduler=False,
         use_chunked_generation_logits=False,
         chunk_size=8
     ):  # logic adpted from HandleGenerationLogits.cpp to use chunked transfer
-        if should_exclude_last:
+        if extra_token_for_overlap_scheduler:
             # Exclude last logits is used when overlap scheduler is used, that generates one extra token,
             # so we should make sure there's memory for that extra +1.
             seq_length += 1
         self.seq_length = seq_length
         self.use_device_memory = use_device_memory
-        self._should_exclude_last = should_exclude_last
         self.use_chunked_generation_logits = use_chunked_generation_logits
         self.chunk_size = chunk_size
         self._logits_indices = []
@@ -126,14 +126,14 @@ def append(self, logits: torch.Tensor):
                 non_blocking=True)
         self._logits_indices.append((position, new_position))
 
-    def get(self, all_logits: bool) -> torch.Tensor | None:
+    def get(self, all_logits: bool, exclude_last: bool) -> torch.Tensor | None:
         """Returns the used logits storage if there are any, otherwise, returns None.
         When all_logits is True then all set logits are returned, otherwise, only the last logits are returned."""
         if self._storage is None:
             return None
 
         try:
-            last = -2 if self._should_exclude_last else -1
+            last = -2 if exclude_last else -1
             start = 0 if all_logits else self._logits_indices[last][0]
             end = self._logits_indices[last][1]
             return self._storage[start:end]
@@ -175,9 +175,6 @@ def finalize_chunked_transfer(self):
         if self.use_chunked_generation_logits and self._device_fragments:
             self._transfer_chunk_to_host()
 
-    def set_exclude_last(self, should_exclude_last: bool) -> None:
-        self._should_exclude_last = should_exclude_last
-
 
 class LogProbStorage:
     beam_width: int = -1
@@ -225,6 +222,7 @@ class PyResult:
     """PyResult reimplements some features of `bindings.executor.Result` in Python"""
 
     def __init__(self,
+                 *,
                  prompt_len: int,
                  max_new_tokens: int,
                  use_device_memory=True,
@@ -240,16 +238,20 @@ def __init__(self,
             assert chunk_size == 1, "chunk_size must be 1 in streaming mode"
         self._streaming = streaming
         self._chunk_size = chunk_size
+        self._exclude_last_generation_logits = exclude_last_generation_logits
 
         # Note that in C++ implemnetation both context logits and generation logits are stored on host memory.
         # Here we only use host memory for generation logits if in chunked model.
         self._context_logits = LogitsStorage(
-            prompt_len, use_device_memory, use_chunked_generation_logits=False
+            seq_length=prompt_len,
+            use_device_memory=use_device_memory,
+            extra_token_for_overlap_scheduler=False,
+            use_chunked_generation_logits=False
         ) if return_context_logits else None
         self._generation_logits = LogitsStorage(
-            max_new_tokens,
-            use_device_memory,
-            exclude_last_generation_logits,
+            seq_length=max_new_tokens,
+            use_device_memory=use_device_memory,
+            extra_token_for_overlap_scheduler=exclude_last_generation_logits,
             use_chunked_generation_logits=use_chunked_generation_logits,
             chunk_size=self._chunk_size) if return_generation_logits else None
         self._log_probs = LogProbStorage() if return_log_probs else None
@@ -263,6 +265,10 @@ def __init__(self,
             for name in additional_outputs
         } if additional_outputs else None
 
+    def set_exclude_last_generation_logits(
+            self, exclude_last_generation_logits: bool):
+        self._exclude_last_generation_logits = exclude_last_generation_logits
+
     def append_context_logits(self, context_logits: torch.Tensor):
         if self._context_logits:
             self._context_logits.append(context_logits)
@@ -309,7 +315,7 @@ def set_log_probs(self, log_probs: list[TokenLogprobs],
     @property
     def context_logits(self) -> torch.Tensor | None:
         if self._context_logits is None or (storage := self._context_logits.get(
-                all_logits=True)) is None:
+                all_logits=True, exclude_last=False)) is None:
             return None
         return storage[:, 0]  # remove beam_width axis for context
 
@@ -320,7 +326,9 @@ def generation_logits(self) -> torch.Tensor | None:
         if not self._generation_logits:
             return None
 
-        storage = self._generation_logits.get(all_logits=not self._streaming)
+        storage = self._generation_logits.get(
+            all_logits=not self._streaming,
+            exclude_last=self._exclude_last_generation_logits)
         if storage is None:
             return None
         return storage.transpose(0, 1)
@@ -522,14 +530,14 @@ def __init__(
         self.py_stop_words_list = stop_words_list
 
         self.py_result = PyResult(
-            self.py_prompt_len,
-            self.py_max_new_tokens,
-            return_logits_device_memory,
-            self.streaming,
-            return_log_probs,
-            return_context_logits,
-            return_generation_logits,
-            exclude_last_generation_logits,
+            prompt_len=self.py_prompt_len,
+            max_new_tokens=self.py_max_new_tokens,
+            use_device_memory=return_logits_device_memory,
+            streaming=self.streaming,
+            return_log_probs=return_log_probs,
+            return_context_logits=return_context_logits,
+            return_generation_logits=return_generation_logits,
+            exclude_last_generation_logits=exclude_last_generation_logits,
             use_chunked_generation_logits=self.py_use_chunked_generation_logits,
             chunk_size=self.py_logits_chunk_size,
             additional_outputs=additional_outputs)
@@ -543,6 +551,11 @@ def __init__(
         else:
             self._py_embedding_bias_1d = self.embedding_bias
 
+    def set_exclude_last_generation_logits(
+            self, exclude_last_generation_logits: bool):
+        self.py_result.set_exclude_last_generation_logits(
+            exclude_last_generation_logits)
+
     @property
     def cached_tokens(self) -> int:
         return self._cached_tokens
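
Note: the signature change to get() above drops the stored _should_exclude_last flag in favour of a per-call exclude_last argument, while the constructor still reserves one extra slot when the overlap scheduler is in use. Below is a minimal runnable sketch of that indexing, assuming a toy 8-entry vocabulary and a hypothetical TinyLogitsStore class; it is not the library's LogitsStorage, only an illustration of the slicing logic.

import torch


class TinyLogitsStore:
    """Toy stand-in for LogitsStorage; stores logits as [position, beam, vocab]."""

    def __init__(self, *, seq_length: int, extra_token_for_overlap_scheduler: bool = False):
        if extra_token_for_overlap_scheduler:
            seq_length += 1  # reserve a slot for the overlap scheduler's extra token
        self._storage = torch.empty(seq_length, 1, 8)
        self._indices = []  # (start, end) positions of each appended chunk
        self._position = 0

    def append(self, logits: torch.Tensor):
        new_position = self._position + logits.shape[0]
        self._storage[self._position:new_position] = logits
        self._indices.append((self._position, new_position))
        self._position = new_position

    def get(self, all_logits: bool, exclude_last: bool) -> torch.Tensor | None:
        if not self._indices:
            return None
        last = -2 if exclude_last else -1  # skip the extra token's logits when asked
        start = 0 if all_logits else self._indices[last][0]
        return self._storage[start:self._indices[last][1]]


store = TinyLogitsStore(seq_length=4, extra_token_for_overlap_scheduler=True)
for _ in range(3):  # two "real" generation steps plus the extra overlap-scheduler step
    store.append(torch.randn(1, 1, 8))
print(store.get(all_logits=True, exclude_last=True).shape)   # torch.Size([2, 1, 8])
print(store.get(all_logits=True, exclude_last=False).shape)  # torch.Size([3, 1, 8])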

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 0 deletions
@@ -1883,6 +1883,7 @@ def _update_request_states_tp(self, scheduled_requests: ScheduledRequests):
             if request.context_remaining_length == 0:
                 if not self.disable_overlap_scheduler and request.will_complete_next_iteration(
                 ):
+                    request.set_exclude_last_generation_logits(False)
                     request.state = LlmRequestState.GENERATION_TO_COMPLETE
                 else:
                     request.state = LlmRequestState.GENERATION_IN_PROGRESS
@@ -1891,6 +1892,7 @@ def _update_request_states_tp(self, scheduled_requests: ScheduledRequests):
             if request.state != LlmRequestState.GENERATION_COMPLETE:
                 if not self.disable_overlap_scheduler and request.will_complete_next_iteration(
                 ):
+                    request.set_exclude_last_generation_logits(False)
                     request.state = LlmRequestState.GENERATION_TO_COMPLETE
 
     def _update_request_states_star_attention(
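
Note: a minimal runnable sketch of the early-exit decision these two hunks change, using a hypothetical ToyRequest class and plain-string states rather than the executor's LlmRequestState. Under the overlap scheduler a request normally produces one surplus set of logits that get() must exclude; when the request will complete on the very next iteration, that extra forward pass never runs, so the exclusion flag is switched off before the state transition.

class ToyRequest:

    def __init__(self, max_new_tokens: int):
        self.max_new_tokens = max_new_tokens
        self.generated = 0
        self.exclude_last_generation_logits = True  # default when overlap scheduler is on
        self.state = "GENERATION_IN_PROGRESS"

    def will_complete_next_iteration(self) -> bool:
        return self.generated + 1 >= self.max_new_tokens

    def set_exclude_last_generation_logits(self, value: bool):
        self.exclude_last_generation_logits = value


def update_state(request: ToyRequest, disable_overlap_scheduler: bool = False):
    # Mirrors the decision shown in the hunks above (simplified).
    if not disable_overlap_scheduler and request.will_complete_next_iteration():
        request.set_exclude_last_generation_logits(False)  # the fix in this commit
        request.state = "GENERATION_TO_COMPLETE"
    else:
        request.state = "GENERATION_IN_PROGRESS"


req = ToyRequest(max_new_tokens=4)
req.generated = 3  # one token left to generate
update_state(req)
print(req.state, req.exclude_last_generation_logits)  # GENERATION_TO_COMPLETE False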
