-
Notifications
You must be signed in to change notification settings - Fork 6.4k
Add graph runner support with torch compile on CPU #7843
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 28 commits
68a3c1d
f8d5ab2
21a485f
954f5ab
7449778
2e0e9a7
d7917fe
f317bf7
513a2ef
20f3fbc
1d23804
d7264d8
d3643ed
a84d088
28e5047
d5736f3
82981c4
ce41f84
65ea131
6f7d22e
159a103
3a52dcc
d6109c5
1c67857
d04bdf2
a4066c4
9dedc87
fc4118a
b99506f
3f2f32a
fea6858
8f02f28
fdfa7f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -181,7 +181,7 @@ class GenerationBatchResult: | |
| extend_input_len_per_req: List[int] | ||
| extend_logprob_start_len_per_req: List[int] | ||
| bid: int | ||
| can_run_cuda_graph: bool | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we keep the old name?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changing
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I hope to minimize the impact on downstream forks as much as possible. New hardware changes are usually best made independently, with minimal changes to existing NVIDIA GPUs.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for your comments. I'll change them back.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Modified. |
||
| can_run_graph: bool | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -398,7 +398,7 @@ def __init__( | |
| f"max_prefill_tokens={self.max_prefill_tokens}, " | ||
| f"max_running_requests={self.max_running_requests}, " | ||
| f"context_len={self.model_config.context_len}, " | ||
| f"available_gpu_mem={avail_mem:.2f} GB" | ||
| f"{'available_cpu_mem' if self.device == 'cpu' else 'available_gpu_mem'}={avail_mem:.2f} GB" | ||
| ) | ||
|
|
||
| # Init memory pool and cache | ||
|
|
@@ -929,7 +929,7 @@ def event_loop_pp(self): | |
| "extend_logprob_start_len_per_req", None | ||
| ), | ||
| bid=bids[next_mb_id], | ||
| can_run_cuda_graph=result.can_run_cuda_graph, | ||
| can_run_graph=result.can_run_graph, | ||
| ) | ||
| self.process_batch_result(mbs[next_mb_id], output_result) | ||
| last_mbs[next_mb_id] = mbs[next_mb_id] | ||
|
|
@@ -1778,11 +1778,11 @@ def run_batch( | |
| model_worker_batch.hicache_consumer_index | ||
| ) | ||
| if self.pp_group.is_last_rank: | ||
| logits_output, next_token_ids, can_run_cuda_graph = ( | ||
| logits_output, next_token_ids, can_run_graph = ( | ||
| self.tp_worker.forward_batch_generation(model_worker_batch) | ||
| ) | ||
| else: | ||
| pp_hidden_states_proxy_tensors, _, can_run_cuda_graph = ( | ||
| pp_hidden_states_proxy_tensors, _, can_run_graph = ( | ||
| self.tp_worker.forward_batch_generation(model_worker_batch) | ||
| ) | ||
| bid = model_worker_batch.bid | ||
|
|
@@ -1792,7 +1792,7 @@ def run_batch( | |
| next_token_ids, | ||
| bid, | ||
| num_accepted_tokens, | ||
| can_run_cuda_graph, | ||
| can_run_graph, | ||
| ) = self.draft_worker.forward_batch_speculative_generation(batch) | ||
| bs = batch.batch_size() | ||
| self.spec_num_total_accepted_tokens += num_accepted_tokens + bs | ||
|
|
@@ -1827,7 +1827,7 @@ def run_batch( | |
| extend_input_len_per_req=extend_input_len_per_req, | ||
| extend_logprob_start_len_per_req=extend_logprob_start_len_per_req, | ||
| bid=bid, | ||
| can_run_cuda_graph=can_run_cuda_graph, | ||
| can_run_graph=can_run_graph, | ||
| ) | ||
| else: # embedding or reward model | ||
| model_worker_batch = batch.get_model_worker_batch() | ||
|
|
@@ -2288,10 +2288,9 @@ def get_internal_state(self, recv_req: GetInternalStateReq): | |
| "token_capacity": int(self.max_total_num_tokens), | ||
| } | ||
|
|
||
| if not _is_cpu: | ||
| ret["memory_usage"]["cuda_graph"] = round( | ||
| self.tp_worker.worker.model_runner.cuda_graph_mem_usage, 2 | ||
| ) | ||
| ret["memory_usage"]["graph"] = round( | ||
| self.tp_worker.worker.model_runner.graph_mem_usage, 2 | ||
| ) | ||
|
|
||
| if not self.spec_algorithm.is_none() and self.cum_spec_accept_count > 0: | ||
| ret["avg_spec_accept_length"] = ( | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.