Skip to content

Commit 41cf060

Browse files
authoredMar 13, 2025
Add CLI client for shortfin LLM server (#1079)
For testing purposes it's easier to invoke a single CLI that can batch-run values. This can be reworked to include benchmarking the throughput and batch-processing local files.
1 parent b828a47 commit 41cf060

File tree

4 files changed

+219
-75
lines changed

4 files changed

+219
-75
lines changed
 

‎shortfin/python/shortfin/interop/fastapi/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import logging
99

1010
from shortfin.support.deps import ShortfinDepNotFoundError
11+
from ...support.responder import AbstractResponder
1112

1213
try:
1314
from fastapi import Request, Response
@@ -23,7 +24,7 @@
2324
logger = logging.getLogger(__name__)
2425

2526

26-
class FastAPIResponder:
27+
class FastAPIResponder(AbstractResponder):
2728
"""Bridge between FastAPI and shortfin that can be used to send out of band
2829
responses back to a waiting FastAPI async request.
2930
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright 2025 Advanced Micro Devices, Inc.
2+
#
3+
# Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
8+
class AbstractResponder:
9+
"""Interface for a responder to"""
10+
11+
def __init__(self):
12+
pass
13+
14+
def ensure_response(self):
15+
pass
16+
17+
def send_response(self, response):
18+
pass
19+
20+
def start_response(self, **kwargs):
21+
pass
22+
23+
def stream_part(self, content: bytes | None):
24+
pass
+190
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Copyright 2025 Advanced Micro Devices, Inc.
2+
#
3+
# Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
import argparse
8+
import asyncio
9+
import json
10+
import logging
11+
from pathlib import Path
12+
import sys
13+
14+
# Import first as it does dep checking and reporting.
15+
from shortfin import ProgramIsolation
16+
from shortfin.support.responder import AbstractResponder
17+
18+
from .components.generate import ClientGenerateBatchProcess
19+
from .components.io_struct import GenerateReqInput
20+
from .components.lifecycle import ShortfinLlmLifecycleManager
21+
22+
23+
logger = logging.getLogger(__name__)
24+
25+
26+
def add_input_args(parser):
    """Register the prompt-source flags on *parser*.

    ``--prompt`` (a literal prompt string) and ``--prompt-file`` (a JSON
    file containing a list of prompts) are mutually exclusive; exactly one
    must be supplied.
    """
    group = parser.add_argument_group("Input Source", "Inputs to select from")
    group = group.add_mutually_exclusive_group(required=True)
    # Bug fix: the options must be added to the exclusive group (the
    # original added them to `parser`, so exclusivity was never enforced).
    # `required=True` makes argparse reject the no-input case up front
    # instead of crashing later in process_inputs on open(None).
    group.add_argument("--prompt")
    group.add_argument("--prompt-file")
31+
32+
33+
def add_service_args(parser):
    """Register the shortfin LLM service flags on *parser*.

    Shared between the HTTP server and the CLI client so both accept the
    same model/device configuration options.
    """
    add = parser.add_argument
    add(
        "--tokenizer_json",
        type=Path,
        required=True,
        help="Path to a tokenizer.json file",
    )
    add(
        "--tokenizer_config_json",
        type=Path,
        required=False,
        help="Path to a tokenizer_config json file",
    )
    add(
        "--model_config",
        type=Path,
        required=True,
        help="Path to the model config file",
    )
    add(
        "--vmfb",
        type=Path,
        required=True,
        help="Model VMFB to load",
    )
    # Parameter archives are loaded by the runtime's IO-parameters module.
    add(
        "--parameters",
        type=Path,
        nargs="*",
        help="Parameter archives to load (supports: gguf, irpa, safetensors).",
        metavar="FILE",
    )
    add(
        "--device",
        type=str,
        required=True,
        choices=["local-task", "hip", "amdgpu"],
        help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
    )
    add(
        "--device_ids",
        type=str,
        nargs="*",
        default=None,
        help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
    )
    add(
        "--isolation",
        type=str,
        default="per_call",
        choices=[isolation.name.lower() for isolation in ProgramIsolation],
        help="Concurrency control -- How to isolate programs.",
    )
    add(
        "--amdgpu_async_allocations",
        action="store_true",
        help="Enable asynchronous allocations for amdgpu device contexts.",
    )
    add(
        "--amdgpu_allocators",
        default=None,
        help="Allocator to use during VMFB invocation.",
    )
    add(
        "--server_config",
        type=Path,
        help="Path to server configuration file",
    )
    add(
        "--prefix_sharing_algorithm",
        type=str,
        choices=["none", "trie"],
        help="Algorithm to use for prefix sharing in KV cache",
    )
107+
108+
109+
def parse_args(argv):
    """Build the CLI argument parser and parse *argv*."""
    parser = argparse.ArgumentParser()
    # Service flags first, then the prompt-input flags.
    for register in (add_service_args, add_input_args):
        register(parser)
    return parser.parse_args(argv)
115+
116+
117+
def process_inputs(args):
    """Return the list of prompts selected by the CLI arguments.

    ``--prompt`` wins when given; otherwise ``--prompt-file`` is read as a
    JSON document (expected to be a list of prompt strings).
    """
    if args.prompt:
        return [args.prompt]
    # Bug fix: close the file deterministically instead of leaking the
    # handle (the original called json.load(open(...)) without `with`).
    with open(args.prompt_file, "r") as f:
        return json.load(f)
121+
122+
123+
class CliResponder(AbstractResponder):
    """Collects a single generated response for the CLI client.

    Mirrors the responder interface used by the HTTP server, but resolves
    an asyncio Future instead of replying to a web request. Streaming is
    not supported.
    """

    def __init__(self):
        super().__init__()
        self.responded = False
        loop = asyncio.get_running_loop()
        self._loop = loop
        # Resolved (thread-safely) once the generation process responds.
        self.response = asyncio.Future(loop=loop)

    def ensure_response(self):
        # Nothing to finalize for the CLI; the Future carries the result.
        pass

    def send_response(self, response):
        assert not self.responded, "Response already sent"
        if self._loop.is_closed():
            raise IOError("Web server is shut down")
        self.responded = True
        # May be invoked from a worker thread; hop onto the owning loop.
        self._loop.call_soon_threadsafe(self.response.set_result, response)

    def start_response(self, **kwargs):
        raise Exception("Streaming not supported")

    def stream_part(self, content):
        raise Exception("Streaming not supported")
145+
146+
147+
async def main(argv):
    """Entry point for the CLI batch client.

    Parses arguments, starts the shortfin LLM service, submits one request
    per prompt, awaits all responses, prints them, and shuts the service
    down.
    """
    args = parse_args(argv)
    if args.tokenizer_config_json is None:
        # The tokenizer config is only used for the EOS token; infer its
        # path from the tokenizer path when not given explicitly.
        # (Use the module logger consistently; the original mixed the root
        # `logging` module with `logger` calls.)
        logger.info("Argument `--tokenizer_config_json` is not provided")
        logger.info("Inferring tokenizer config path from tokenizer path")
        args.tokenizer_config_json = args.tokenizer_json.with_name(
            args.tokenizer_json.stem + "_config.json"
        )

    # Bug fix: the original called logger.info(msg=..., level=logging.INFO),
    # which raises TypeError (Logger.info accepts no `level` kwarg).
    logger.info("Setting up service")
    lifecycle_manager = ShortfinLlmLifecycleManager(args)
    service = lifecycle_manager.services["default"]
    service.start()

    sampling_params = {"max_completion_tokens": 5}

    prompts = process_inputs(args)

    responders = []
    for prompt in prompts:
        # Lazy %-style args avoid eager interpolation when INFO is disabled.
        logger.info('Submitting request for prompt "%s"', prompt)
        gen_req = GenerateReqInput(text=prompt, sampling_params=sampling_params)
        responder = CliResponder()
        # Launch directly; the original wrapped this in a one-shot inner
        # `async def submit()` that was awaited immediately, adding nothing.
        ClientGenerateBatchProcess(service, gen_req, responder).launch()
        responders.append(responder)

    # Wait for every request to complete before printing results.
    await asyncio.gather(*[r.response for r in responders])

    for responder in responders:
        print(responder.response.result().decode())

    logger.info("Shutting down service")
    service.shutdown()
187+
188+
189+
if __name__ == "__main__":
    # Run the async entry point with everything after the program name.
    asyncio.run(main(sys.argv[1:]))

‎shortfin/python/shortfin_apps/llm/server.py

+3-74
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
# Import first as it does dep checking and reporting.
1515
from shortfin import ProgramIsolation
16+
from .cli import add_service_args
17+
1618
import uvicorn
1719

1820
from .application import get_app
@@ -51,6 +53,7 @@
5153

5254
def parse_args(argv):
5355
parser = argparse.ArgumentParser()
56+
add_service_args(parser)
5457
parser.add_argument("--host", type=str, default="0.0.0.0")
5558
parser.add_argument("--port", type=int, default=8000)
5659
parser.add_argument(
@@ -62,80 +65,6 @@ def parse_args(argv):
6265
parser.add_argument(
6366
"--timeout-keep-alive", type=int, default=5, help="Keep alive timeout"
6467
)
65-
parser.add_argument(
66-
"--tokenizer_json",
67-
type=Path,
68-
required=True,
69-
help="Path to a tokenizer.json file",
70-
)
71-
parser.add_argument(
72-
"--tokenizer_config_json",
73-
type=Path,
74-
required=False,
75-
help="Path to a tokenizer_config json file",
76-
)
77-
parser.add_argument(
78-
"--model_config",
79-
type=Path,
80-
required=True,
81-
help="Path to the model config file",
82-
)
83-
parser.add_argument(
84-
"--vmfb",
85-
type=Path,
86-
required=True,
87-
help="Model VMFB to load",
88-
)
89-
# parameters are loaded with `iree_io_parameters_module_create`
90-
parser.add_argument(
91-
"--parameters",
92-
type=Path,
93-
nargs="*",
94-
help="Parameter archives to load (supports: gguf, irpa, safetensors).",
95-
metavar="FILE",
96-
)
97-
parser.add_argument(
98-
"--device",
99-
type=str,
100-
required=True,
101-
choices=["local-task", "hip", "amdgpu"],
102-
help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
103-
)
104-
parser.add_argument(
105-
"--device_ids",
106-
type=str,
107-
nargs="*",
108-
default=None,
109-
help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
110-
)
111-
parser.add_argument(
112-
"--isolation",
113-
type=str,
114-
default="per_call",
115-
choices=[isolation.name.lower() for isolation in ProgramIsolation],
116-
help="Concurrency control -- How to isolate programs.",
117-
)
118-
parser.add_argument(
119-
"--amdgpu_async_allocations",
120-
action="store_true",
121-
help="Enable asynchronous allocations for amdgpu device contexts.",
122-
)
123-
parser.add_argument(
124-
"--amdgpu_allocators",
125-
default=None,
126-
help="Allocator to use during VMFB invocation.",
127-
)
128-
parser.add_argument(
129-
"--server_config",
130-
type=Path,
131-
help="Path to server configuration file",
132-
)
133-
parser.add_argument(
134-
"--prefix_sharing_algorithm",
135-
type=str,
136-
choices=["none", "trie"],
137-
help="Algorithm to use for prefix sharing in KV cache",
138-
)
13968
return parser.parse_args(argv)
14069

14170

0 commit comments

Comments
 (0)
Please sign in to comment.