Update spec infer #15

Open · wants to merge 27 commits into main
18 changes: 10 additions & 8 deletions examples/runtime/engine/offline_batch_inference.py
@@ -1,21 +1,23 @@
import sglang as sgl

import time

def main():
    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
        "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Where is the capital city of France? ASSISTANT:",
        "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: 北京今天天气怎么样? ASSISTANT:"
    ]
    # Create a sampling params object.
    sampling_params = {"temperature": 0.8, "top_p": 0.95}
    sampling_params = {"temperature": 0, "max_new_tokens": 30}

    # Create an LLM.
    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")

    # Create an LLM.
    llm = sgl.Engine(model_path="Llama-2-7b-chat-hf", draft_model_path='EAGLE-llama2-chat-7B', disable_cuda_graph=True, num_speculative_steps=5, eagle_topk=8, num_draft_tokens=64, speculative_algorithm='EAGLE', mem_fraction_static=0.60)
    # llm = sgl.Engine(model_path="Llama-2-7b-chat-hf", disable_cuda_graph=False)
    # outputs = llm.generate(prompts, sampling_params)
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    print(time.time() - start)
    # Print the outputs.
    for prompt, output in zip(prompts, outputs):
        print("===============================")
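For reviewers who want to try the updated example end to end, here is a minimal driver sketch. The engine arguments simply mirror the diff above (they are specific to this PR, not the released sglang API); the run_example name, the __main__ guard, the shutdown() call, and the output printing are assumptions added for completeness.

import time

import sglang as sgl


def run_example():
    # Prompt and sampling params copied from the example above.
    prompts = ["The capital of France is"]
    sampling_params = {"temperature": 0, "max_new_tokens": 30}

    # Speculative-decoding engine; these arguments mirror this PR's diff and
    # may differ from the upstream sglang API.
    llm = sgl.Engine(
        model_path="Llama-2-7b-chat-hf",
        draft_model_path="EAGLE-llama2-chat-7B",
        speculative_algorithm="EAGLE",
        num_speculative_steps=5,
        eagle_topk=8,
        num_draft_tokens=64,
        disable_cuda_graph=True,
        mem_fraction_static=0.60,
    )

    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    print(f"generate() took {time.time() - start:.2f}s")
    for prompt, output in zip(prompts, outputs):
        print(prompt, "->", output["text"])
    llm.shutdown()  # assumed cleanup; the example above omits it


if __name__ == "__main__":
    run_example()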
13 changes: 10 additions & 3 deletions python/sglang/srt/layers/attention/__init__.py
@@ -23,20 +23,25 @@ def init_cuda_graph_state(self, max_bs: int):
    def init_forward_metadata_capture_cuda_graph(
        self,
        bs: int,
        num_token: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        encoder_lens: Optional[torch.Tensor] = None,
        encoder_lens: torch.Tensor = None,
        spec_info=None,
        is_draft_runner=False,
    ):
        """Init the metadata for a forward pass for capturing a cuda graph."""
        raise NotImplementedError()

    def init_forward_metadata_replay_cuda_graph(
        self,
        bs: int,
        num_token: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        seq_lens_sum: int,
        encoder_lens: Optional[torch.Tensor] = None,
        encoder_lens=None,
        spec_info=None,
    ):
        """Init the metadata for a forward pass for replaying a cuda graph."""
        raise NotImplementedError()
@@ -54,7 +59,9 @@ def forward(
        forward_batch: ForwardBatch,
    ):
        """Run forward on an attention layer."""
        if forward_batch.forward_mode.is_decode():
        if forward_batch.forward_mode.is_verify():
            return self.forward_extend(q, k, v, layer, forward_batch)
        elif forward_batch.forward_mode.is_decode():
            return self.forward_decode(q, k, v, layer, forward_batch)
        else:
            return self.forward_extend(q, k, v, layer, forward_batch)
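To make the interface change above concrete, here is an illustrative stub of a backend that satisfies the new signatures. Only the method signatures follow the diff; the StubAttnBackend name, the import path, and the method bodies are assumptions, and the comments are one reading of what a real backend would do, not part of this PR.

from typing import Optional

import torch

# Assumes the AttentionBackend base class shown in the diff above.
from sglang.srt.layers.attention import AttentionBackend


class StubAttnBackend(AttentionBackend):
    # Illustrative stub only; not a real attention backend.

    def init_forward_metadata_capture_cuda_graph(
        self,
        bs: int,
        num_token: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        encoder_lens: Optional[torch.Tensor] = None,
        spec_info=None,
        is_draft_runner=False,
    ):
        # A real backend would presumably size its CUDA-graph buffers by
        # num_token (which can exceed bs for draft/verify batches) and stash
        # spec_info for use at replay time.
        pass

    def init_forward_metadata_replay_cuda_graph(
        self,
        bs: int,
        num_token: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        seq_lens_sum: int,
        encoder_lens: Optional[torch.Tensor] = None,
        spec_info=None,
    ):
        pass

    def forward_decode(self, q, k, v, layer, forward_batch):
        raise NotImplementedError()

    def forward_extend(self, q, k, v, layer, forward_batch):
        # Verify-mode batches are routed here by AttentionBackend.forward,
        # per the dispatch added in this diff.
        raise NotImplementedError()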