From 865233e2565fa4cbb89e806bf371866f4ef9d56f Mon Sep 17 00:00:00 2001
From: Ankur Neog
Date: Sat, 23 Nov 2024 09:52:23 +0530
Subject: [PATCH] Add initial support for Intel Gaudi accelerators (#2121)

---
 python/pyproject.toml                            | 5 +++++
 python/sglang/bench_one_batch.py                 | 5 +----
 python/sglang/srt/model_executor/model_runner.py | 5 +++--
 python/sglang/srt/server_args.py                 | 2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 42e44e23111..a474264271a 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+# For Intel Gaudi (device: hpu), follow the installation guide:
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
index ea65b5383fa..9313fbf6f5c 100644
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -278,10 +278,7 @@ def correctness_test(
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index c2659f5b740..3d5e450a43e 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -176,14 +176,15 @@ def __init__(
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
         # Init torch distributed
+        torch.get_device_module(self.device).set_device(self.gpu_id)
         if self.device == "cuda":
-            torch.cuda.set_device(self.gpu_id)
             backend = "nccl"
         # ToDO(liangan1):Just use gloo to bypass the initilization fail
         # Need to use xccl for xpu backend in the future
         elif self.device == "xpu":
-            torch.xpu.set_device(self.gpu_id)
             backend = "gloo"
+        elif self.device == "hpu":
+            backend = "hccl"
 
         if not self.server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 6e374dc9272..7d2842e3481 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -306,7 +306,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             "--device",
             type=str,
             default="cuda",
-            choices=["cuda", "xpu"],
+            choices=["cuda", "xpu", "hpu"],
             help="The device type.",
        )
        parser.add_argument(
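
The bench_one_batch.py change collapses the per-device branching in synchronize into a single call: torch.get_device_module(device) resolves a device string to its device module (torch.cuda, torch.xpu, and torch.hpu once the Gaudi stack is installed). A minimal standalone sketch of the same idea, assuming a PyTorch version that provides torch.get_device_module:

    import torch

    def synchronize(device: str) -> None:
        # Resolve "cuda" -> torch.cuda, "xpu" -> torch.xpu, "hpu" -> torch.hpu,
        # then wait for all outstanding work on that device to finish.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")  # equivalent to torch.cuda.synchronize()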
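
In model_runner.py the device placement likewise becomes a single torch.get_device_module(self.device).set_device(self.gpu_id) call, and the remaining branch only picks the torch.distributed backend: NCCL for CUDA, gloo as a temporary workaround for XPU, and HCCL for Gaudi (HPU). A standalone sketch of that mapping; pick_dist_backend is a hypothetical helper name used here for illustration, not a function from the patch:

    def pick_dist_backend(device: str) -> str:
        # Same mapping as the branch added in init_torch_distributed.
        if device == "cuda":
            return "nccl"
        if device == "xpu":
            return "gloo"  # temporary bypass until an XPU collective backend is usable
        if device == "hpu":
            return "hccl"  # provided by the Intel Gaudi PyTorch bridge
        raise ValueError(f"Unsupported device: {device}")

    assert pick_dist_backend("hpu") == "hccl"

Keeping set_device outside the branch means a future accelerator only needs a backend entry here, provided PyTorch exposes a matching device module.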
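
Taken together with the new --device choice in server_args.py, the srt_hpu/all_hpu/dev_hpu extras give Gaudi users the same install path as the HIP and XPU builds: after setting up the Gaudi-enabled vLLM/PyTorch stack from the linked guide, something like pip install "sglang[all_hpu]" pulls in the runtime-common dependencies, and the server is then started with --device hpu. The exact launch command is not part of this patch and is mentioned here only as an illustration.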