@@ -11,6 +11,7 @@
 
 import torch
 
+from tensorrt_llm._torch.expert_statistic import ExpertStatistic
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 try:
@@ -136,6 +137,7 @@ def __init__(self,
 
         self.peft_cache_config = peft_cache_config
 
+        self.iter_counter = 0
         # profile config
         self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
             PROFILE_START_STOP_ENV_VAR_NAME)
@@ -575,7 +577,7 @@ def profile_step():
                formatted_timestamp = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                logger.info(
-                   f"iter = {self.model_engine.iter_counter}, "
+                   f"iter = {self.iter_counter}, "
                    f"global_rank = {self.global_rank}, "
                    f"rank = {self.dist.rank}, "
                    f"currank_total_requests = {self.executor_request_queue.num_fetch_requests_cur_rank}/"
@@ -705,7 +707,7 @@ def _update_iter_stats(self, stats, iter_latency_ms, num_completed_requests,
        stats.cpu_mem_usage = 0
        stats.pinned_mem_usage = 0
 
-       stats.iter = self.model_engine.iter_counter
+       stats.iter = self.iter_counter
 
        kv_cache_manager = self.resource_manager.resource_managers.get(
            ResourceManagerType.KV_CACHE_MANAGER)
@@ -1004,6 +1006,8 @@ def _executor_loop_pp(self):
                        self.active_requests,
                        previous_batch)
 
+               self.iter_counter += 1
+
    def wait_on_pp_send_handles(self, microbatch_id):
        if self.send_handles[microbatch_id] is not None:
            self.send_handles[microbatch_id].wait()
@@ -1244,6 +1248,8 @@ def _executor_loop(self):
                            iter_stats=iter_stats,
                            iter_start_time=iter_start_time))
 
+               self.iter_counter += 1
+
    def _prepare_draft_requests(self):
        try:
            # Set draft tokens here to make the KV cache manager
@@ -1472,6 +1478,8 @@ def _executor_loop_overlap(self):
 
                self._kv_connector_terminate_requests()
 
+               self.iter_counter += 1
+
    def _process_previous_batch(self):
        if self.kv_cache_transceiver and self.previous_batch.ctx_transmission_reqs:
            for req in self.previous_batch.ctx_transmission_reqs:
@@ -1875,9 +1883,10 @@ def _check_disagg_gen_cache_transfer_status(self, atLeastNum: int = 0):
    def _forward_step(self,
                      scheduled_requests,
                      new_tensors_device: Optional[SampleStateTensors] = None):
+       ExpertStatistic.set_iter(self.iter_counter)
 
        @nvtx_range(
-           f"[Executor] _forward_step {self.model_engine.iter_counter + 1}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
+           f"[Executor] _forward_step {self.iter_counter}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
        )
        def forward(scheduled_requests, resource_manager, new_tensors_device,
                    gather_context_logits, cache_indirection_buffer):
@@ -2215,7 +2224,7 @@ def _handle_responses(self):
 
            # Skip active requests that are not scheduled
            if request.return_perf_metrics and request.py_decoding_iter >= 1:
-               request.update_perf_metrics(self.model_engine.iter_counter)
+               request.update_perf_metrics(self.iter_counter)
 
            request_done = False
            if request.py_decoding_iter == 1 or request.is_finished or \
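The diff above moves iteration counting from the model engine onto the executor itself: the executor owns iter_counter, increments it once at the end of each executor-loop pass (_executor_loop, _executor_loop_pp, _executor_loop_overlap), and publishes it via ExpertStatistic.set_iter before every forward step, so logging, iteration stats, NVTX ranges, and per-request perf metrics all read the same counter. Below is a minimal, self-contained sketch of that ownership pattern under simplified assumptions; ExpertStatisticStub, ToyExecutor, and executor_loop are illustrative stand-ins, not the TensorRT-LLM API.

# Sketch only: executor-owned iteration counter, with a stand-in statistics hook.


class ExpertStatisticStub:
    """Stand-in for ExpertStatistic: remembers the current iteration index."""
    current_iter = None

    @classmethod
    def set_iter(cls, iter_id):
        cls.current_iter = iter_id


class ToyExecutor:

    def __init__(self):
        # The executor, not the model engine, owns the counter (starts at 0).
        self.iter_counter = 0

    def _forward_step(self, batch):
        # Publish the executor's iteration index before running the model,
        # mirroring ExpertStatistic.set_iter(self.iter_counter) in the diff.
        ExpertStatisticStub.set_iter(self.iter_counter)
        return f"iter {self.iter_counter}: processed {batch}"

    def executor_loop(self, batches):
        results = []
        for batch in batches:
            results.append(self._forward_step(batch))
            # One increment per executor-loop pass, as in each _executor_loop*.
            self.iter_counter += 1
        return results


if __name__ == "__main__":
    # Expected output: iteration indices 0, 1, 2 for the three batches.
    print("\n".join(ToyExecutor().executor_loop(["b0", "b1", "b2"])))

Presumably the point of the change is that consumers no longer depend on how often the model engine advances its own internal counter; the executor's loop count is the single source of truth, which is also why the NVTX range drops the "+ 1" offset that compensated for the engine counter being incremented inside the forward call.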