Commit 88814d4

[WIP] merge router into weight update
2 parents 8c8710e + 84a1698

33 files changed: +2529 -703 lines

.github/workflows/release-pypi-router.yml (+5)

@@ -3,6 +3,11 @@
 name: Release SGLang Router to PyPI
 
 on:
+  push:
+    branches:
+      - main
+    paths:
+      - rust/pyproject.toml
   workflow_dispatch:
 
 jobs:

benchmark/multi_turn_chat/long_prompt_multi_turn.py (+42 -11)

@@ -1,21 +1,24 @@
 import itertools
 import json
+import os
 import random
 import string
 import threading
 import time
 from argparse import ArgumentParser
+from pathlib import Path
+from typing import Union
+
+from tqdm import tqdm
 
 import sglang as sgl
-from sglang.srt.hf_transformers_utils import get_tokenize
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import (
     add_common_sglang_args_and_parse,
     select_sglang_backend,
 )
 from sglang.utils import dump_state_text
 
-random.seed(42)
-
 
 def gen_prompt(tokenizer, token_num):
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -24,12 +27,34 @@ def gen_prompt(tokenizer, token_num):
     return ret
 
 
+def get_cache_path(args):
+    # Create cache directory under ~/.cache/sglang
+    cache_dir = Path.home() / ".cache" / "sglang"
+
+    # Create a unique cache filename based on the arguments that affect generation
+    cache_key = f"qa_{args.num_qa}_{args.turns}_{args.system_prompt_len}_{args.len_q}_{args.len_a}_{args.tokenizer.replace('/', '_')}.json"
+    return cache_dir / cache_key
+
+
 def gen_arguments(args, tokenizer):
-    multi_qas = [
-        {"system_prompt": gen_prompt(tokenizer, args.system_prompt_len), "qas": []}
-        for _ in range(args.num_qa)
-    ]
-    for i in range(args.num_qa):
+    cache_path = get_cache_path(args)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"Loading cached arguments from {cache_path}")
+        with open(cache_path, "r") as f:
+            return json.load(f)
+
+    print("Generating new arguments...")
+    # First progress bar for system prompts
+    multi_qas = []
+    for _ in tqdm(range(args.num_qa), desc="Generating system prompts"):
+        multi_qas.append(
+            {"system_prompt": gen_prompt(tokenizer, args.system_prompt_len), "qas": []}
+        )
+
+    # Nested progress bars for QA pairs
+    for i in tqdm(range(args.num_qa), desc="Generating QA pairs"):
         qas = multi_qas[i]["qas"]
         for j in range(args.turns):
             qas.append(
@@ -38,14 +63,21 @@ def gen_arguments(args, tokenizer):
                     "new_tokens": args.len_a,
                 }
             )
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(cache_path, "w") as f:
+        json.dump(multi_qas, f)
+    print(f"Cached arguments saved to {cache_path}")
+
     return multi_qas
 
 
 @sgl.function
 def multi_turns(s, system_prompt, qas):
     s += system_prompt
 
-    for qa in qas:
+    for i, qa in enumerate(qas):
         s += qa["prompt"]
         s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
 
@@ -62,7 +94,7 @@ def main(args):
         multi_qas,
         temperature=0,
         backend=backend,
-        num_threads=args.parallel,
+        num_threads="auto",
         progress_bar=True,
     )
     latency = time.time() - tic
@@ -75,7 +107,6 @@
         value = {
            "task": "multi_turn_system_prompt_chat",
            "backend": args.backend,
-            "num_gpus": 1,
            "latency": round(latency, 3),
            "num_requests": args.num_qa,
            "num_turns": args.turns,

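The gen_arguments() change above is a straightforward disk-cache pattern: derive a cache path from the arguments that affect generation, return the cached JSON when it exists, otherwise generate the data and write it back. A minimal standalone sketch of the same pattern (cached_json and build_fn are illustrative names, not part of the commit):

import json
from pathlib import Path

def cached_json(cache_key, build_fn):
    # Cache files live under ~/.cache/sglang, as in the benchmark script.
    cache_path = Path.home() / ".cache" / "sglang" / cache_key
    if cache_path.exists():
        print(f"Loading cached arguments from {cache_path}")
        with open(cache_path, "r") as f:
            return json.load(f)
    data = build_fn()
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w") as f:
        json.dump(data, f)
    print(f"Cached arguments saved to {cache_path}")
    return data

# Example: cached_json("qa_demo.json", lambda: [{"system_prompt": "...", "qas": []}])
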
python/pyproject.toml (+5)

@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+# For Intel Gaudi (device: hpu) follow the installation guide
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"

python/sglang/bench_one_batch.py (+1 -4)

@@ -278,10 +278,7 @@ def correctness_test(
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(

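The bench_one_batch.py change replaces the per-backend if/elif chain with torch.get_device_module, which maps a device string to its backend module. A small sketch of how the generalized helper reads in isolation, assuming a PyTorch version that exposes torch.get_device_module:

import torch

def synchronize(device: str) -> None:
    # torch.get_device_module("cuda") returns torch.cuda, "xpu" returns torch.xpu,
    # and so on, so one call covers every accelerator backend.
    torch.get_device_module(device).synchronize()

if torch.cuda.is_available():
    synchronize("cuda")  # equivalent to torch.cuda.synchronize()
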
python/sglang/bench_serving.py (+39 -3)

@@ -727,9 +727,9 @@ def sample_generated_shared_prefix_requests(
     total_input_tokens = 0
     total_output_tokens = 0
 
-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
         system_prompt = system_prompts[group_idx]
-        for prompt_idx in range(prompts_per_group):
+        for prompt_idx in tqdm(range(prompts_per_group), desc="Generating questions"):
             question = questions[group_idx * prompts_per_group + prompt_idx]
             full_prompt = f"{system_prompt}\n\n{question}"
             prompt_len = len(tokenizer.encode(full_prompt))
@@ -859,6 +859,7 @@
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    max_concurrency: Optional[int],
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
     profile: bool,
@@ -868,6 +869,15 @@
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
@@ -913,7 +923,7 @@
         )
         tasks.append(
             asyncio.create_task(
-                request_func(request_func_input=request_func_input, pbar=pbar)
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
             )
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +950,12 @@
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max request concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1019,7 @@
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": request_rate,
+        "max_concurrency": max_concurrency,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1107,10 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1201,6 +1222,7 @@ def run_benchmark(args_: argparse.Namespace):
                 tokenizer=tokenizer,
                 input_requests=input_requests,
                 request_rate=args.request_rate,
+                max_concurrency=args.max_concurrency,
                 disable_tqdm=args.disable_tqdm,
                 extra_request_body=extra_request_body,
                 profile=args.profile,
@@ -1220,6 +1242,7 @@ def run_benchmark(args_: argparse.Namespace):
                     tokenizer=tokenizer,
                     input_requests=input_requests,
                     request_rate=rate,
+                    max_concurrency=args.max_concurrency,
                     disable_tqdm=args.disable_tqdm,
                     extra_request_body=extra_request_body,
                     profile=args.profile,
@@ -1319,6 +1342,19 @@ def set_ulimit(target_soft_limit=65535):
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
     parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",

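The new --max-concurrency flag caps in-flight requests with an asyncio.Semaphore, while --request-rate still controls when requests are launched; once the cap is reached, the effective request rate falls below the configured one. A self-contained sketch of the same gating pattern, with fake_request standing in for the benchmark's real request function:

import asyncio
from typing import List, Optional

async def fake_request(i: int) -> int:
    # Stand-in for the benchmark's request function (it would issue an HTTP call).
    await asyncio.sleep(0.1)
    return i

async def run(num_requests: int, max_concurrency: Optional[int]) -> List[int]:
    # Only gate requests when a concurrency limit is set, mirroring benchmark().
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:
            return await fake_request(i)

    return await asyncio.gather(*(limited(i) for i in range(num_requests)))

# e.g. asyncio.run(run(100, max_concurrency=8)) keeps at most 8 requests in flight.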