Skip to content

Commit

Permalink
Update docs (#1839)
Browse files Browse the repository at this point in the history
  • Loading branch information
merrymercy authored Oct 30, 2024
1 parent 539df95 commit b548801
Show file tree
Hide file tree
Showing 11 changed files with 165 additions and 198 deletions.
2 changes: 1 addition & 1 deletion docs/deploy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python3
# Deploy the documents

import os
from datetime import datetime
Expand Down
Empty file removed docs/deploy_docs.sh
Empty file.
167 changes: 92 additions & 75 deletions docs/openai_api.ipynb

Large diffs are not rendered by default.

131 changes: 51 additions & 80 deletions docs/send_request.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion python/sglang/srt/mem_cache/flush_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@
parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args()

response = requests.get(args.url + "/flush_cache")
response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200
2 changes: 1 addition & 1 deletion python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def __init__(
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95
self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration"
Expand Down
4 changes: 2 additions & 2 deletions python/sglang/srt/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args)


@app.get("/flush_cache")
@app.post("/flush_cache")
async def flush_cache():
"""Flush the radix cache."""
tokenizer_manager.flush_cache()
Expand Down Expand Up @@ -180,7 +180,7 @@ async def get_memory_pool_size():

return ret
except Exception as e:
return JSONResponse(
return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
)

Expand Down
44 changes: 7 additions & 37 deletions python/sglang/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

import numpy as np
import requests
import torch
from IPython.display import HTML, display
from tqdm import tqdm

Expand Down Expand Up @@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
time.sleep(5)
print_highlight(
"""\n
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"""
Server and notebook outputs are combined for clarity.
Typically, the server runs in a separate terminal.
Server output is gray; notebook output is highlighted.
"""
)
break

Expand All @@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:


def terminate_process(process):
"""Safely terminate a process and clean up GPU memory.
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)


def print_highlight(html_content: str):
Expand Down
4 changes: 4 additions & 0 deletions scripts/ci_install_dependency.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Install the dependencies in CI.
# NOTE: this is a shell script — Python-style `"""` docstrings are not valid
# sh/bash comments (an unmatched `"` corrupts quoting), so use `#` instead.

pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.45.2
Expand Down
4 changes: 4 additions & 0 deletions scripts/killall_sglang.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
# Kill all SGLang processes and free the GPU memory.
# NOTE: shell comments use `#`; a Python-style `"""` docstring is not a
# comment in sh/bash and breaks quoting.
# Best-effort: each kill targets PIDs matched by name; grep -v 'grep'
# excludes the grep process itself from the match.

kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
3 changes: 2 additions & 1 deletion scripts/version_branch_to_tag.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash

# This script tags all remote branches starting with 'v' with the same name as the branch,
# This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.

git fetch origin --prune
Expand Down

0 comments on commit b548801

Please sign in to comment.