Skip to content

Commit

Permalink
Update docs (#1839)
Browse files Browse the repository at this point in the history
  • Loading branch information
merrymercy authored Oct 30, 2024
1 parent 539df95 commit b548801
Show file tree
Hide file tree
Showing 11 changed files with 165 additions and 198 deletions.
2 changes: 1 addition & 1 deletion docs/deploy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python3
# Deploy the documents

import os
from datetime import datetime
Expand Down
Empty file removed docs/deploy_docs.sh
Empty file.
167 changes: 92 additions & 75 deletions docs/openai_api.ipynb

Large diffs are not rendered by default.

131 changes: 51 additions & 80 deletions docs/send_request.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/sglang/srt/mem_cache/flush_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args()

response = requests.get(args.url + "/flush_cache")
response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200
2 changes: 1 addition & 1 deletion python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def __init__(
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95
self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration"
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args)


@app.get("/flush_cache")
@app.post("/flush_cache")
async def flush_cache():
"""Flush the radix cache."""
tokenizer_manager.flush_cache()
Expand Down Expand Up @@ -180,7 +180,7 @@ async def get_memory_pool_size():

return ret
except Exception as e:
return JSONResponse(
return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
)

Expand Down
44 changes: 7 additions & 37 deletions python/sglang/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import numpy as np
import requests
import torch
from IPython.display import HTML, display
from tqdm import tqdm

Expand Down Expand Up @@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
time.sleep(5)
print_highlight(
"""\n
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"""
Server and notebook outputs are combined for clarity.
Typically, the server runs in a separate terminal.
Server output is gray; notebook output is highlighted.
"""
)
break

Expand All @@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:


def terminate_process(process):
"""Safely terminate a process and clean up GPU memory.
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)


def print_highlight(html_content: str):
Expand Down
4 changes: 4 additions & 0 deletions scripts/ci_install_dependency.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Install the dependencies in CI.
# NOTE: this is a shell script — Python-style `"""` docstrings are not valid
# sh/bash comments (an unmatched `"` corrupts quoting), so use `#` instead.

pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.45.2
Expand Down
4 changes: 4 additions & 0 deletions scripts/killall_sglang.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
# Kill all SGLang processes and free the GPU memory.
# NOTE: shell comments use `#`; a Python-style `"""` docstring is not a
# comment in sh/bash and breaks quoting.
# Best-effort: each kill targets PIDs matched by name; grep -v 'grep'
# excludes the grep process itself from the match.

kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
3 changes: 2 additions & 1 deletion scripts/version_branch_to_tag.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash

# This script tags all remote branches starting with 'v' with the same name as the branch,
# This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.

git fetch origin --prune
Expand Down

0 comments on commit b548801

Please sign in to comment.