add rpc test list

Superjomn · Superjomn · commit a52402765d86 · 2025-09-29T21:35:29.000+08:00
Signed-off-by: chunweiy <chunweiy@nvidia.com>
Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
diff --git a/tensorrt_llm/executor/rpc/README.md b/tensorrt_llm/executor/rpc/README.md
@@ -0,0 +1,85 @@
+# A Lightweight RPC
+This is a lightweight, pure-Python RPC framework that we built to simplify the existing IPC code in the orchestrator. It provides multiple call modes (sync, async, future, streaming) and supports both IPC and TCP connections.
+
+## Examples
+### Create Server and Client
+
+```python
+from tensorrt_llm.executor.rpc import RPCServer, RPCClient
+
+# Define your application
+class App:
+    def add(self, a: int, b: int) -> int:
+        return a + b
+    
+    async def async_multiply(self, x: int, y: int) -> int:
+        return x * y
+
+# Create and start server
+app = App()
+with RPCServer(app) as server:
+    server.bind("ipc:///tmp/my_rpc")  # or "tcp://127.0.0.1:5555"
+    server.start()
+    
+    # Create client and make calls
+    with RPCClient("ipc:///tmp/my_rpc") as client:
+        result = client.add(5, 3).remote()
+        print(result)  # Output: 8
+```
+
+### Different Remote Calls
+
+#### Synchronous Call
+```python
+# Blocking call that waits for result
+result = client.add(10, 20).remote()
+# or with timeout
+result = client.add(10, 20).remote(timeout=5.0)
+```
+
+#### Asynchronous Call
+```python
+# Async call that returns a coroutine
+result = await client.async_multiply(3, 4).remote_async()
+```
+
+#### Future-based Call
+```python
+# Returns a concurrent.futures.Future
+future = client.add(1, 2).remote_future()
+# Get result later
+result = future.result()
+```
+
+#### Fire-and-Forget Call
+```python
+# Send request without waiting for response
+client.submit_task(task_id=123).remote(need_response=False)
+```
+
+#### Streaming Call
+```python
+# For async generator methods
+async for value in client.stream_data(n=10).remote_streaming():
+    print(f"Received: {value}")
+```
+
+### Error Handling
+```python
+from tensorrt_llm.executor.rpc import RPCError, RPCTimeout
+
+try:
+    result = client.risky_operation().remote(timeout=1.0)
+except RPCTimeout:
+    print("Operation timed out")
+except RPCError as e:
+    print(f"RPC Error: {e}")
+    print(f"Original cause: {e.cause}")
+    print(f"Traceback: {e.traceback}")
+```
+
+### Graceful Shutdown
+```python
+# Shutdown server from client
+client.shutdown_server()
+```
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -46,6 +46,8 @@ l0_a10:
   - unittest/llmapi/test_serialization.py
   - unittest/llmapi/test_utils.py
   - unittest/llmapi/test_llm_args.py
+  # executor
+  - unittest/executor/test_rpc.py
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml
@@ -16,6 +16,10 @@ l0_a100:
     - unittest/llmapi/test_llm_pytorch.py
     - unittest/llmapi/test_mpi_session.py # generic tests
     - unittest/trt/model_api/test_model_quantization.py
+    # executor
+    - unittest/executor/test_base_worker.py
+    - unittest/executor/test_rpc_proxy.py
+    - unittest/executor/test_rpc_worker.py
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/unittest/executor/test_base_worker.py b/tests/unittest/executor/test_base_worker.py
@@ -12,6 +12,7 @@
 # isort: off
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
 from utils.llm_data import llm_models_root
+from utils.util import skip_single_gpu
 # isort: on
 
 from tensorrt_llm._torch.pyexecutor.config import update_executor_config
@@ -156,6 +157,8 @@ def create_worker_session(self):
         session = MpiPoolSession(n_workers=2)
         return session
 
+    @pytest.mark.gpu2
+    @skip_single_gpu
     def test_create_executor(self):
         futures = self.session.submit(
             TestRpcWorkerBaseTP2.create_executor,
diff --git a/tests/unittest/executor/test_rpc_proxy.py b/tests/unittest/executor/test_rpc_proxy.py
@@ -14,7 +14,7 @@
 # isort: off
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
 from utils.llm_data import llm_models_root
-from utils.util import similar
+from utils.util import similar, skip_single_gpu
 # isort: on
 
 model_path = llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -78,6 +78,8 @@ def test_tp1(self, num_reqs):
             assert isinstance(kv_cache_events, list)
 
     @pytest.mark.parametrize("num_reqs", [1, 10])
+    @skip_single_gpu
+    @pytest.mark.gpu2
     def test_tp2(self, num_reqs):
         tokenizer = TransformersTokenizer.from_pretrained(model_path)
         prompt = "A B C D"
diff --git a/tests/unittest/executor/test_rpc_worker.py b/tests/unittest/executor/test_rpc_worker.py
@@ -18,6 +18,7 @@
 # isort: off
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/..")
 from utils.llm_data import llm_models_root
+from utils.util import skip_single_gpu
 # isort: on
 
 model_path = llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
@@ -215,11 +216,15 @@ def create_worker_session(self):
     def create_rpc_client(self, addr: str):
         return RPCClient(addr)
 
+    @skip_single_gpu
+    @pytest.mark.gpu2
     def test_create_shutdown(self):
         # Invoke setup_engine in rank 0, and that will unblock all the ranks to
         # invoke setup_engine simultaneously.
         pass
 
+    @skip_single_gpu
+    @pytest.mark.gpu2
     def test_fetch_responses_sync(self):
         # Wait a bit to ensure engine is ready
         time.sleep(1)