@@ -853,7 +853,7 @@ def _executor_loop_pp(self):
853853 self .num_scheduled_requests = scheduled_batch .batch_size
854854
855855 logger .debug (
856- f'has { len (self .active_requests )} active_request , '
856+ f'has { len (self .active_requests )} active_requests , '
857857 f'scheduled { len (scheduled_batch .context_requests )} context requests and '
858858 f'{ len (scheduled_batch .generation_requests )} generation requests'
859859 )
@@ -1094,7 +1094,7 @@ def _prepare_and_schedule_batch(self):
10941094
10951095 self .num_scheduled_requests = scheduled_batch .batch_size
10961096 logger .debug (
1097- f'has { len (self .active_requests )} active_request , '
1097+ f'has { len (self .active_requests )} active_requests , '
10981098 f'scheduled { len (scheduled_batch .context_requests )} context requests and '
10991099 f'{ len (scheduled_batch .generation_requests )} generation requests' )
11001100 return scheduled_batch , iter_stats
@@ -1374,21 +1374,22 @@ def _executor_loop_overlap(self):
13741374 if target_inputs is not None :
13751375 self ._process_draft_results (scheduled_batch ,
13761376 draft_outputs , draft_batch )
1377- elif self .previous_batch is not None and not use_previous_draft_tokens :
1378- self ._update_requests (self .previous_batch .sample_state )
1377+ if target_inputs is None and self .previous_batch is not None and not use_previous_draft_tokens :
1378+ self ._update_requests (self .previous_batch .sample_state )
13791379
1380- if self .block_reuse_enabled and not self .kv_cache_manager .is_vswa and self .kv_cache_transceiver :
1381- for req in self .previous_batch .sample_state .scheduled_requests .context_requests :
1382- if req .is_context_only_request and (
1383- req .is_context_finished
1384- or req .is_finished_due_to_length ):
1385- block_id = self .kv_cache_manager .store_blocks_for_reuse (
1386- req , True )
1387- self .ctx_in_transmission_requests [
1388- req .py_request_id ] = (
1389- (req , block_id ,
1390- self .ctx_in_transmission_counter ))
1380+ if self .block_reuse_enabled and not self .kv_cache_manager .is_vswa and self .kv_cache_transceiver :
1381+ for req in self .previous_batch .sample_state .scheduled_requests .context_requests :
1382+ if req .is_context_only_request and (
1383+ req .is_context_finished
1384+ or req .is_finished_due_to_length ):
1385+ block_id = self .kv_cache_manager .store_blocks_for_reuse (
1386+ req , True )
1387+ self .ctx_in_transmission_requests [
1388+ req .py_request_id ] = (
1389+ (req , block_id ,
1390+ self .ctx_in_transmission_counter ))
13911391
1392+ if scheduled_batch .batch_size > 0 :
13921393 if self .guided_decoder is not None :
13931394 # add_batch must be called again to have updated new tokens.
13941395 self .guided_decoder .add_batch (scheduled_batch )
@@ -1404,9 +1405,10 @@ def _executor_loop_overlap(self):
14041405 scheduled_batch .context_requests
14051406 ) if self .kv_cache_transceiver else []
14061407
1407- if self .previous_batch is not None :
1408- self ._process_previous_batch ()
1408+ if self .previous_batch is not None :
1409+ self ._process_previous_batch ()
14091410
1411+ if scheduled_batch .batch_size > 0 :
14101412 if self .enable_iter_perf_stats :
14111413 iter_stats .inflight_batching_stats .num_ctx_tokens = self .model_engine .iter_states [
14121414 'num_ctx_tokens' ]
@@ -1879,7 +1881,17 @@ def _update_request_states_tp(self, scheduled_requests: ScheduledRequests):
18791881 request .context_chunk_size )
18801882 request .move_to_next_context_chunk ()
18811883 if request .context_remaining_length == 0 :
1882- request .state = LlmRequestState .GENERATION_IN_PROGRESS
1884+ if not self .disable_overlap_scheduler and request .will_complete_next_iteration (
1885+ ):
1886+ request .state = LlmRequestState .GENERATION_TO_COMPLETE
1887+ else :
1888+ request .state = LlmRequestState .GENERATION_IN_PROGRESS
1889+
1890+ for request in scheduled_requests .generation_requests :
1891+ if request .state != LlmRequestState .GENERATION_COMPLETE :
1892+ if not self .disable_overlap_scheduler and request .will_complete_next_iteration (
1893+ ):
1894+ request .state = LlmRequestState .GENERATION_TO_COMPLETE
18831895
18841896 def _update_request_states_star_attention (
18851897 self , scheduled_requests : ScheduledRequests ):