From 865233e2565fa4cbb89e806bf371866f4ef9d56f Mon Sep 17 00:00:00 2001
From: Ankur Neog
Date: Sat, 23 Nov 2024 09:52:23 +0530
Subject: [PATCH] Add initial support for Intel Gaudi accelerators (#2121)

---
 python/pyproject.toml                            | 5 +++++
 python/sglang/bench_one_batch.py                 | 5 +----
 python/sglang/srt/model_executor/model_runner.py | 5 +++--
 python/sglang/srt/server_args.py                 | 2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index 42e44e23111..a474264271a 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+# For Intel Gaudi (device: hpu), follow the installation guide:
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
index ea65b5383fa..9313fbf6f5c 100644
--- a/python/sglang/bench_one_batch.py
+++ b/python/sglang/bench_one_batch.py
@@ -278,10 +278,7 @@ def correctness_test(
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index c2659f5b740..3d5e450a43e 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -176,14 +176,15 @@ def __init__(
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
         # Init torch distributed
+        torch.get_device_module(self.device).set_device(self.gpu_id)
         if self.device == "cuda":
-            torch.cuda.set_device(self.gpu_id)
             backend = "nccl"
         # ToDO(liangan1):Just use gloo to bypass the initilization fail
         # Need to use xccl for xpu backend in the future
         elif self.device == "xpu":
-            torch.xpu.set_device(self.gpu_id)
             backend = "gloo"
+        elif self.device == "hpu":
+            backend = "hccl"
 
         if not self.server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 6e374dc9272..7d2842e3481 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -306,7 +306,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
             "--device",
             type=str,
             default="cuda",
-            choices=["cuda", "xpu"],
+            choices=["cuda", "xpu", "hpu"],
             help="The device type.",
        )
        parser.add_argument(
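
The bench_one_batch.py change collapses the per-device branching in synchronize into a single call: torch.get_device_module(device) resolves a device string to its device module (torch.cuda, torch.xpu, and torch.hpu once the Gaudi stack is installed). A minimal standalone sketch of the same idea, assuming a PyTorch version that provides torch.get_device_module:

    import torch

    def synchronize(device: str) -> None:
        # Resolve "cuda" -> torch.cuda, "xpu" -> torch.xpu, "hpu" -> torch.hpu,
        # then wait for all outstanding work on that device to finish.
        torch.get_device_module(device).synchronize()

    if torch.cuda.is_available():
        synchronize("cuda")  # equivalent to torch.cuda.synchronize()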
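
In model_runner.py the device placement likewise becomes a single torch.get_device_module(self.device).set_device(self.gpu_id) call, and the remaining branch only picks the torch.distributed backend: NCCL for CUDA, gloo as a temporary workaround for XPU, and HCCL for Gaudi (HPU). A standalone sketch of that mapping; pick_dist_backend is a hypothetical helper name used here for illustration, not a function from the patch:

    def pick_dist_backend(device: str) -> str:
        # Same mapping as the branch added in init_torch_distributed.
        if device == "cuda":
            return "nccl"
        if device == "xpu":
            return "gloo"  # temporary bypass until an XPU collective backend is usable
        if device == "hpu":
            return "hccl"  # provided by the Intel Gaudi PyTorch bridge
        raise ValueError(f"Unsupported device: {device}")

    assert pick_dist_backend("hpu") == "hccl"

Keeping set_device outside the branch means a future accelerator only needs a backend entry here, provided PyTorch exposes a matching device module.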
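
Taken together with the new --device choice in server_args.py, the srt_hpu/all_hpu/dev_hpu extras give Gaudi users the same install path as the HIP and XPU builds: after setting up the Gaudi-enabled vLLM/PyTorch stack from the linked guide, something like pip install "sglang[all_hpu]" pulls in the runtime-common dependencies, and the server is then started with --device hpu. The exact launch command is not part of this patch and is mentioned here only as an illustration.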