Commit

fix conf
Chayenne committed Oct 30, 2024
2 parents 67170ea + a410c30 commit 64e2b9d
Showing 11 changed files with 434 additions and 21 deletions.
10 changes: 10 additions & 0 deletions 3rdparty/amd/profiling/PROFILING.md
@@ -0,0 +1,10 @@
## Profiling the SGLang Inference System on AMD GPUs
This AppNote describes SGLang profiling techniques, code augmentation, and running steps for systems with AMD Instinct GPUs; the same procedure may also work with Nvidia GPUs.
Detailed examples and steps are provided to facilitate easy reproduction and to help localize performance problems for optimization.
Two primary methods are covered (a short Torch Profiler sketch follows the list):
- [RPD](https://github.com/ROCm/rocmProfileData.git)


- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
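
As a minimal sketch (not part of this diff), a Torch Profiler run around an inference call might look as follows; `run_inference` is a hypothetical stand-in for the actual SGLang workload:

```python
# Minimal Torch Profiler sketch; `run_inference` is a hypothetical
# placeholder for the SGLang workload being profiled.
import torch
from torch.profiler import ProfilerActivity, profile, record_function

def run_inference():
    # Placeholder compute; replace with a real SGLang generate call.
    x = torch.randn(1024, 1024, device="cuda")
    return x @ x

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    with record_function("inference"):
        run_inference()

# Sort by device time to localize hot kernels (on ROCm builds the
# CUDA activity corresponds to HIP kernels).
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
prof.export_chrome_trace("trace.json")  # viewable in chrome://tracing
```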


13 changes: 13 additions & 0 deletions 3rdparty/amd/tuning/TUNING.md
@@ -0,0 +1,13 @@
## Tuning the SGLang Inference System on AMD GPUs
This AppNote describes SGLang performance-tuning techniques, harness code, and running steps for systems with AMD Instinct GPUs.
Harness code, examples, and steps are provided in detail to facilitate easy reproduction and workload-specific performance tuning.
Three primary runtime areas are covered (a Tunable Ops sketch follows the list):
- Triton Kernels


- Torch Tunable Ops


- Torch Compile
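
As a hedged sketch of the Tunable Ops area (the environment-variable names are the standard PyTorch TunableOp controls; the GEMM below is only a stand-in workload, not part of this diff):

```python
# Sketch: enable PyTorch Tunable Ops so GEMMs are tuned and cached.
# Set the env vars before importing torch to be safe.
import os
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"    # turn TunableOp on
os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"     # search for best solutions
os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "tunableop_results.csv"

import torch

a = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
b = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
c = a @ b  # the first call triggers tuning; results are cached to the CSV
torch.cuda.synchronize()
```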


2 changes: 1 addition & 1 deletion python/sglang/lang/chat_template.py
@@ -144,7 +144,7 @@ def get_chat_template_by_model_path(model_path):
"assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
},
style=ChatTemplateStyle.PLAIN,
stop_str=("<|im_end|>"),
stop_str=("<|im_end|>",),
image_token="<|vision_start|><|image_pad|><|vision_end|>",
)
)
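The one-character change above matters: in Python, parentheses without a trailing comma are just grouping, so the old `stop_str` was a plain string rather than the intended one-element tuple. A minimal illustration:

```python
# Without a trailing comma, parentheses are only grouping:
stop_str = ("<|im_end|>")
print(type(stop_str))  # <class 'str'>

# With the trailing comma, this is the intended one-element tuple:
stop_str = ("<|im_end|>",)
print(type(stop_str))  # <class 'tuple'>
```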
10 changes: 9 additions & 1 deletion python/sglang/lang/interpreter.py
@@ -54,7 +54,14 @@ def run_internal(state, program, func_args, func_kwargs, sync):


def run_program(
program, backend, func_args, func_kwargs, default_sampling_para, stream, sync=False
program,
backend,
func_args,
func_kwargs,
default_sampling_para,
stream,
sync=False,
use_thread=True,
):
if hasattr(backend, "endpoint"):
backend = backend.endpoint
@@ -67,6 +74,7 @@ def run_program(
chat_template=None,
stream=stream,
num_api_spec_tokens=program.num_api_spec_tokens,
use_thread=use_thread,
)
state = ProgramState(stream_executor)

11 changes: 10 additions & 1 deletion python/sglang/lang/ir.py
@@ -168,6 +168,7 @@ def run(
return_text_in_logprobs: Optional[bool] = None,
stream: bool = False,
backend=None,
use_thread: bool = True,
**kwargs,
):
from sglang.lang.interpreter import run_program
@@ -195,7 +196,15 @@ def run(
return_text_in_logprobs=return_text_in_logprobs,
)
backend = backend or global_config.default_backend
return run_program(self, backend, args, kwargs, default_sampling_para, stream)
return run_program(
self,
backend,
args,
kwargs,
default_sampling_para,
stream,
use_thread=use_thread,
)

def run_batch(
self,
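A hedged usage sketch of the new `use_thread` flag added in interpreter.py and ir.py above (the `text_qa` program and the endpoint URL are illustrative, not part of this diff):

```python
# Hypothetical usage of the new `use_thread` parameter.
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=64)

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

# use_thread=False asks the interpreter to run the program without
# spawning a worker thread for the stream executor.
state = text_qa.run(question="What is the capital of France?", use_thread=False)
print(state["answer"])
```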
84 changes: 67 additions & 17 deletions python/sglang/srt/managers/tokenizer_manager.py
@@ -539,9 +539,22 @@ async def get_memory_pool_size(self):
self.create_handle_loop()

req = GetMemPoolSizeReq()
self.send_to_scheduler.send_pyobj(req)
self.mem_pool_size = asyncio.Future()
return await self.mem_pool_size
ret = None

if self.server_args.dp_size == 1:
self.send_to_scheduler.send_pyobj(req)
self.mem_pool_size = asyncio.Future()
res = await self.mem_pool_size
ret = res.size

else: # self.server_args.dp_size > 1
self.send_to_scheduler.send_pyobj(req)
self.mem_pool_size = asyncio.Future()
self.mem_pool_size_tmp = []
res = await self.mem_pool_size
ret = [r.size for r in res]

return ret

async def update_weights(
self, obj: UpdateWeightReqInput, request: Optional[fastapi.Request] = None
@@ -554,18 +567,43 @@ async def update_weights(
obj.load_format = self.server_args.load_format

if not self.model_update_lock.locked():
async with self.model_update_lock:
# wait for the previous generation requests to finish
while len(self.rid_to_state) > 0:
await asyncio.sleep(0.001)
self.send_to_scheduler.send_pyobj(obj)
self.model_update_result = asyncio.Future()
result = await self.model_update_result
if result.success:
self.server_args.model_path = obj.model_path
self.server_args.load_format = obj.load_format
self.model_path = obj.model_path
return result.success, result.message

if self.server_args.dp_size == 1:
async with self.model_update_lock:
# wait for the previous generation requests to finish
while len(self.rid_to_state) > 0:
await asyncio.sleep(0.001)
self.send_to_scheduler.send_pyobj(obj)
self.model_update_result = asyncio.Future()
result = await self.model_update_result
if result.success:
self.server_args.model_path = obj.model_path
self.server_args.load_format = obj.load_format
self.model_path = obj.model_path
return result.success, result.message

else: # self.server_args.dp_size > 1

            # There will be dp_size responses from the detokenizer
async with self.model_update_lock:
# wait for the previous generation requests to finish
while len(self.rid_to_state) > 0:
await asyncio.sleep(0.001)
self.send_to_scheduler.send_pyobj(obj)
self.model_update_result = asyncio.Future()
self.model_update_tmp = []
result = await self.model_update_result

                all_success = all(r.success for r in result)
                if all_success:
self.server_args.model_path = obj.model_path
self.server_args.load_format = obj.load_format
self.model_path = obj.model_path
all_message = [r.message for r in result]
all_message = " | ".join(all_message)

return all_success, all_message

else:
return False, "Another update is in progress. Please try again later."
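
For context, a hedged sketch of driving this code path over HTTP; it assumes the server exposes a matching `/update_weights` route, and the port and checkpoint path are placeholders:

```python
# Hypothetical client call for the update_weights path above; the URL,
# port, and model path are assumptions, not part of this diff.
import requests

resp = requests.post(
    "http://localhost:30000/update_weights",
    json={"model_path": "/path/to/new/checkpoint"},
)
# Mirrors the (success, message) pair returned by update_weights.
print(resp.status_code, resp.json())
```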

@@ -600,10 +638,22 @@ async def handle_loop(self):
] = await self.recv_from_detokenizer.recv_pyobj()

if isinstance(recv_obj, UpdateWeightReqOutput):
self.model_update_result.set_result(recv_obj)
if self.server_args.dp_size == 1:
self.model_update_result.set_result(recv_obj)
else: # self.server_args.dp_size > 1
self.model_update_tmp.append(recv_obj)
                    # set the future once all results are received
if len(self.model_update_tmp) == self.server_args.dp_size:
self.model_update_result.set_result(self.model_update_tmp)
continue
elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
self.mem_pool_size.set_result(recv_obj)
if self.server_args.dp_size == 1:
self.mem_pool_size.set_result(recv_obj)
                else:  # self.server_args.dp_size > 1
                    self.mem_pool_size_tmp.append(recv_obj)
                    # set the future once all results are received
if len(self.mem_pool_size_tmp) == self.server_args.dp_size:
self.mem_pool_size.set_result(self.mem_pool_size_tmp)
continue

assert isinstance(
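The dp_size > 1 branches above all follow one fan-in pattern: a single Future that handle_loop resolves only after dp_size responses have been buffered. A self-contained sketch of that pattern (all names are illustrative):

```python
# Generic sketch of the fan-in pattern used by the dp_size > 1 branches:
# one Future is resolved only after all n worker responses arrive.
import asyncio

async def fan_in_demo(n: int):
    result_future = asyncio.get_running_loop().create_future()
    buffer = []

    def on_response(resp):
        buffer.append(resp)
        if len(buffer) == n:  # all data-parallel replicas have answered
            result_future.set_result(buffer)

    # Simulate n replicas replying one by one.
    for i in range(n):
        on_response({"size": 1000 + i})

    return [r["size"] for r in await result_future]

print(asyncio.run(fan_in_demo(3)))  # [1000, 1001, 1002]
```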