Skip to content

Commit 41cf060

Browse files
authoredMar 13, 2025
Add CLI client for shortfin LLM server (#1079)
For testing purposes it's easier to invoke a single CLI that can batch-run values. This can be reworked to include benchmarking the throughput and batch-processing local files.
1 parent b828a47 commit 41cf060

File tree

4 files changed

+219
-75
lines changed

4 files changed

+219
-75
lines changed
 

‎shortfin/python/shortfin/interop/fastapi/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import logging
99

1010
from shortfin.support.deps import ShortfinDepNotFoundError
11+
from ...support.responder import AbstractResponder
1112

1213
try:
1314
from fastapi import Request, Response
@@ -23,7 +24,7 @@
2324
logger = logging.getLogger(__name__)
2425

2526

26-
class FastAPIResponder:
27+
class FastAPIResponder(AbstractResponder):
2728
"""Bridge between FastAPI and shortfin that can be used to send out of band
2829
responses back to a waiting FastAPI async request.
2930
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright 2025 Advanced Micro Devices, Inc.
2+
#
3+
# Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
8+
class AbstractResponder:
9+
"""Interface for a responder to"""
10+
11+
def __init__(self):
12+
pass
13+
14+
def ensure_response(self):
15+
pass
16+
17+
def send_response(self, response):
18+
pass
19+
20+
def start_response(self, **kwargs):
21+
pass
22+
23+
def stream_part(self, content: bytes | None):
24+
pass
+190
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Copyright 2025 Advanced Micro Devices, Inc.
2+
#
3+
# Licensed under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
7+
import argparse
8+
import asyncio
9+
import json
10+
import logging
11+
from pathlib import Path
12+
import sys
13+
14+
# Import first as it does dep checking and reporting.
15+
from shortfin import ProgramIsolation
16+
from shortfin.support.responder import AbstractResponder
17+
18+
from .components.generate import ClientGenerateBatchProcess
19+
from .components.io_struct import GenerateReqInput
20+
from .components.lifecycle import ShortfinLlmLifecycleManager
21+
22+
23+
logger = logging.getLogger(__name__)
24+
25+
26+
def add_input_args(parser):
    """Register the prompt-source flags on *parser*.

    ``--prompt`` (a literal prompt string) and ``--prompt-file`` (a JSON
    file containing a list of prompts) are mutually exclusive; exactly one
    must be supplied.
    """
    group = parser.add_argument_group("Input Source", "Inputs to select from")
    group = group.add_mutually_exclusive_group(required=True)
    # Bug fix: the options must be added to the exclusive group (the
    # original added them to `parser`, so exclusivity was never enforced).
    # `required=True` makes argparse reject the no-input case up front
    # instead of crashing later in process_inputs on open(None).
    group.add_argument("--prompt")
    group.add_argument("--prompt-file")
31+
32+
33+
def add_service_args(parser):
    """Register the shortfin LLM service flags on *parser*.

    Shared between the HTTP server and the CLI client so both accept the
    same model/device configuration options.
    """
    add = parser.add_argument
    add(
        "--tokenizer_json",
        type=Path,
        required=True,
        help="Path to a tokenizer.json file",
    )
    add(
        "--tokenizer_config_json",
        type=Path,
        required=False,
        help="Path to a tokenizer_config json file",
    )
    add(
        "--model_config",
        type=Path,
        required=True,
        help="Path to the model config file",
    )
    add(
        "--vmfb",
        type=Path,
        required=True,
        help="Model VMFB to load",
    )
    # Parameter archives are loaded by the runtime's IO-parameters module.
    add(
        "--parameters",
        type=Path,
        nargs="*",
        help="Parameter archives to load (supports: gguf, irpa, safetensors).",
        metavar="FILE",
    )
    add(
        "--device",
        type=str,
        required=True,
        choices=["local-task", "hip", "amdgpu"],
        help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
    )
    add(
        "--device_ids",
        type=str,
        nargs="*",
        default=None,
        help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
    )
    add(
        "--isolation",
        type=str,
        default="per_call",
        choices=[isolation.name.lower() for isolation in ProgramIsolation],
        help="Concurrency control -- How to isolate programs.",
    )
    add(
        "--amdgpu_async_allocations",
        action="store_true",
        help="Enable asynchronous allocations for amdgpu device contexts.",
    )
    add(
        "--amdgpu_allocators",
        default=None,
        help="Allocator to use during VMFB invocation.",
    )
    add(
        "--server_config",
        type=Path,
        help="Path to server configuration file",
    )
    add(
        "--prefix_sharing_algorithm",
        type=str,
        choices=["none", "trie"],
        help="Algorithm to use for prefix sharing in KV cache",
    )
107+
108+
109+
def parse_args(argv):
    """Build the CLI argument parser and parse *argv*."""
    parser = argparse.ArgumentParser()
    # Service flags first, then the prompt-input flags.
    for register in (add_service_args, add_input_args):
        register(parser)
    return parser.parse_args(argv)
115+
116+
117+
def process_inputs(args):
    """Return the list of prompts selected by the CLI arguments.

    ``--prompt`` wins when given; otherwise ``--prompt-file`` is read as a
    JSON document (expected to be a list of prompt strings).
    """
    if args.prompt:
        return [args.prompt]
    # Bug fix: close the file deterministically instead of leaking the
    # handle (the original called json.load(open(...)) without `with`).
    with open(args.prompt_file, "r") as f:
        return json.load(f)
121+
122+
123+
class CliResponder(AbstractResponder):
    """Collects a single generated response for the CLI client.

    Mirrors the responder interface used by the HTTP server, but resolves
    an asyncio Future instead of replying to a web request. Streaming is
    not supported.
    """

    def __init__(self):
        super().__init__()
        self.responded = False
        loop = asyncio.get_running_loop()
        self._loop = loop
        # Resolved (thread-safely) once the generation process responds.
        self.response = asyncio.Future(loop=loop)

    def ensure_response(self):
        # Nothing to finalize for the CLI; the Future carries the result.
        pass

    def send_response(self, response):
        assert not self.responded, "Response already sent"
        if self._loop.is_closed():
            raise IOError("Web server is shut down")
        self.responded = True
        # May be invoked from a worker thread; hop onto the owning loop.
        self._loop.call_soon_threadsafe(self.response.set_result, response)

    def start_response(self, **kwargs):
        raise Exception("Streaming not supported")

    def stream_part(self, content):
        raise Exception("Streaming not supported")
145+
146+
147+
async def main(argv):
    """Entry point for the CLI batch client.

    Parses arguments, starts the shortfin LLM service, submits one request
    per prompt, awaits all responses, prints them, and shuts the service
    down.
    """
    args = parse_args(argv)
    if args.tokenizer_config_json is None:
        # The tokenizer config is only used for the EOS token; infer its
        # path from the tokenizer path when not given explicitly.
        # (Use the module logger consistently; the original mixed the root
        # `logging` module with `logger` calls.)
        logger.info("Argument `--tokenizer_config_json` is not provided")
        logger.info("Inferring tokenizer config path from tokenizer path")
        args.tokenizer_config_json = args.tokenizer_json.with_name(
            args.tokenizer_json.stem + "_config.json"
        )

    # Bug fix: the original called logger.info(msg=..., level=logging.INFO),
    # which raises TypeError (Logger.info accepts no `level` kwarg).
    logger.info("Setting up service")
    lifecycle_manager = ShortfinLlmLifecycleManager(args)
    service = lifecycle_manager.services["default"]
    service.start()

    sampling_params = {"max_completion_tokens": 5}

    prompts = process_inputs(args)

    responders = []
    for prompt in prompts:
        # Lazy %-style args avoid eager interpolation when INFO is disabled.
        logger.info('Submitting request for prompt "%s"', prompt)
        gen_req = GenerateReqInput(text=prompt, sampling_params=sampling_params)
        responder = CliResponder()
        # Launch directly; the original wrapped this in a one-shot inner
        # `async def submit()` that was awaited immediately, adding nothing.
        ClientGenerateBatchProcess(service, gen_req, responder).launch()
        responders.append(responder)

    # Wait for every request to complete before printing results.
    await asyncio.gather(*[r.response for r in responders])

    for responder in responders:
        print(responder.response.result().decode())

    logger.info("Shutting down service")
    service.shutdown()
187+
188+
189+
if __name__ == "__main__":
    # Run the async entry point with everything after the program name.
    asyncio.run(main(sys.argv[1:]))

‎shortfin/python/shortfin_apps/llm/server.py

+3-74
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
# Import first as it does dep checking and reporting.
1515
from shortfin import ProgramIsolation
16+
from .cli import add_service_args
17+
1618
import uvicorn
1719

1820
from .application import get_app
@@ -51,6 +53,7 @@
5153

5254
def parse_args(argv):
5355
parser = argparse.ArgumentParser()
56+
add_service_args(parser)
5457
parser.add_argument("--host", type=str, default="0.0.0.0")
5558
parser.add_argument("--port", type=int, default=8000)
5659
parser.add_argument(
@@ -62,80 +65,6 @@ def parse_args(argv):
6265
parser.add_argument(
6366
"--timeout-keep-alive", type=int, default=5, help="Keep alive timeout"
6467
)
65-
parser.add_argument(
66-
"--tokenizer_json",
67-
type=Path,
68-
required=True,
69-
help="Path to a tokenizer.json file",
70-
)
71-
parser.add_argument(
72-
"--tokenizer_config_json",
73-
type=Path,
74-
required=False,
75-
help="Path to a tokenizer_config json file",
76-
)
77-
parser.add_argument(
78-
"--model_config",
79-
type=Path,
80-
required=True,
81-
help="Path to the model config file",
82-
)
83-
parser.add_argument(
84-
"--vmfb",
85-
type=Path,
86-
required=True,
87-
help="Model VMFB to load",
88-
)
89-
# parameters are loaded with `iree_io_parameters_module_create`
90-
parser.add_argument(
91-
"--parameters",
92-
type=Path,
93-
nargs="*",
94-
help="Parameter archives to load (supports: gguf, irpa, safetensors).",
95-
metavar="FILE",
96-
)
97-
parser.add_argument(
98-
"--device",
99-
type=str,
100-
required=True,
101-
choices=["local-task", "hip", "amdgpu"],
102-
help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ",
103-
)
104-
parser.add_argument(
105-
"--device_ids",
106-
type=str,
107-
nargs="*",
108-
default=None,
109-
help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0",
110-
)
111-
parser.add_argument(
112-
"--isolation",
113-
type=str,
114-
default="per_call",
115-
choices=[isolation.name.lower() for isolation in ProgramIsolation],
116-
help="Concurrency control -- How to isolate programs.",
117-
)
118-
parser.add_argument(
119-
"--amdgpu_async_allocations",
120-
action="store_true",
121-
help="Enable asynchronous allocations for amdgpu device contexts.",
122-
)
123-
parser.add_argument(
124-
"--amdgpu_allocators",
125-
default=None,
126-
help="Allocator to use during VMFB invocation.",
127-
)
128-
parser.add_argument(
129-
"--server_config",
130-
type=Path,
131-
help="Path to server configuration file",
132-
)
133-
parser.add_argument(
134-
"--prefix_sharing_algorithm",
135-
type=str,
136-
choices=["none", "trie"],
137-
help="Algorithm to use for prefix sharing in KV cache",
138-
)
13968
return parser.parse_args(argv)
14069

14170

0 commit comments

Comments
 (0)
Please sign in to comment.