 
 import torch
 
+from tensorrt_llm._torch.expert_statistic import ExpertStatistic
 from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
 
 try:
@@ -135,6 +136,7 @@ def __init__(self,
 
         self.peft_cache_config = peft_cache_config
 
+        self.iter_counter = 0
         # profile config
         self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
             PROFILE_START_STOP_ENV_VAR_NAME)
@@ -567,7 +569,7 @@ def profile_step():
         formatted_timestamp = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         logger.info(
-            f"iter = {self.model_engine.iter_counter}, "
+            f"iter = {self.iter_counter}, "
             f"global_rank = {self.global_rank}, "
             f"rank = {self.dist.rank}, "
             f"currank_total_requests = {self.executor_request_queue.num_fetch_requests_cur_rank}/"
@@ -697,7 +699,7 @@ def _update_iter_stats(self, stats, iter_latency_ms, num_completed_requests,
         stats.cpu_mem_usage = 0
         stats.pinned_mem_usage = 0
 
-        stats.iter = self.model_engine.iter_counter
+        stats.iter = self.iter_counter
 
         kv_cache_manager = self.resource_manager.resource_managers.get(
             ResourceManagerType.KV_CACHE_MANAGER)
@@ -994,6 +996,8 @@ def _executor_loop_pp(self):
                     self.active_requests,
                     previous_batch)
 
+            self.iter_counter += 1
+
     def wait_on_pp_send_handles(self, microbatch_id):
         if self.send_handles[microbatch_id] is not None:
             self.send_handles[microbatch_id].wait()
@@ -1232,6 +1236,8 @@ def _executor_loop(self):
                         iter_stats=iter_stats,
                         iter_start_time=iter_start_time))
 
+            self.iter_counter += 1
+
     def _prepare_draft_requests(self):
         try:
             # Set draft tokens here to make the KV cache manager
@@ -1417,6 +1423,8 @@ def _executor_loop_overlap(self):
 
             self._kv_connector_terminate_requests()
 
+            self.iter_counter += 1
+
     def _process_previous_batch(self):
         if self.kv_cache_transceiver and self.previous_batch.ctx_transmission_reqs:
             for req in self.previous_batch.ctx_transmission_reqs:
@@ -1820,9 +1828,10 @@ def _check_disagg_gen_cache_transfer_status(self, atLeastNum: int = 0):
     def _forward_step(self,
                       scheduled_requests,
                       new_tensors_device: Optional[SampleStateTensors] = None):
+        ExpertStatistic.set_iter(self.iter_counter)
 
         @nvtx_range(
-            f"[Executor] _forward_step {self.model_engine.iter_counter + 1}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
+            f"[Executor] _forward_step {self.iter_counter}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
         )
         def forward(scheduled_requests, resource_manager, new_tensors_device,
                     gather_context_logits, cache_indirection_buffer):
@@ -2160,7 +2169,7 @@ def _handle_responses(self):
 
             # Skip active requests that are not scheduled
             if request.return_perf_metrics and request.py_decoding_iter >= 1:
-                request.update_perf_metrics(self.model_engine.iter_counter)
+                request.update_perf_metrics(self.iter_counter)
 
             request_done = False
             if request.py_decoding_iter == 1 or request.is_finished or \
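Taken together, the diff moves iteration counting from the model engine onto the executor: self.iter_counter is initialized in __init__, read by the profiling log, iteration stats, the NVTX range name, perf metrics, and ExpertStatistic.set_iter, and incremented once at the end of each executor loop body (_executor_loop, _executor_loop_pp, _executor_loop_overlap). A minimal standalone sketch of that pattern follows; ToyExecutor and ExpertStatisticStub are hypothetical stand-ins for the real TensorRT-LLM classes, not their APIs.

```python
# Minimal sketch (not TensorRT-LLM code) of the executor-owned iteration counter
# introduced in this diff. ExpertStatisticStub and ToyExecutor are hypothetical
# stand-ins for ExpertStatistic and PyExecutor.


class ExpertStatisticStub:
    """Remembers which iteration current statistics belong to, like ExpertStatistic.set_iter."""
    current_iter = None

    @classmethod
    def set_iter(cls, iter_id):
        cls.current_iter = iter_id


class ToyExecutor:

    def __init__(self):
        # The counter now lives on the executor instead of the model engine.
        self.iter_counter = 0

    def _forward_step(self, num_requests):
        # Propagate the executor's iteration index before running the model,
        # mirroring ExpertStatistic.set_iter(self.iter_counter) in _forward_step.
        ExpertStatisticStub.set_iter(self.iter_counter)
        print(f"iter = {self.iter_counter}: {num_requests} reqs")

    def executor_loop(self, batches):
        for num_requests in batches:
            self._forward_step(num_requests)
            # Incremented once at the end of each loop iteration, as in
            # _executor_loop, _executor_loop_pp, and _executor_loop_overlap.
            self.iter_counter += 1


if __name__ == "__main__":
    ToyExecutor().executor_loop([3, 2, 1])
```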