4 | 4 | from pathlib import Path
5 | 5 | from queue import Queue
6 | 6 | from threading import Event
7 | | -from typing import Any, Optional, Type, Union
| 7 | +from typing import Any, AsyncGenerator, Optional, Type, Union
8 | 8 |
9 | 9 | import ray |
10 | 10 | import torch |
11 | 11 |
12 | | -from .._utils import mpi_rank, ray_use_rpc |
| 12 | +from .._utils import nvtx_range_debug, ray_use_rpc |
13 | 13 | from ..bindings import executor as tllm |
14 | 14 | from ..builder import Engine |
15 | 15 | from ..llmapi.llm_args import BaseLlmArgs |
@@ -236,53 +236,63 @@ def enqueue_request(self, |
236 | 236 | request: GenerationRequest, |
237 | 237 | result_wait_queue: Queue | None = None) -> int: |
238 | 238 | # TODO: remove this. Originally we didn't have to handle all the request id dicts.
| 239 | + # raise ValueError("enqueue_request should not be called.") |
239 | 240 | return self._enqueue_request(request, result_wait_queue) |
240 | 241 |
| 242 | + def start(self): |
| 243 | + pass |
| 244 | + |
241 | 245 | def submit(self, request: GenerationRequest): |
242 | | - print(f"RayGPUWorker {self.rank} submitted request {request.id}") |
243 | 246 | return super().submit(request) |
244 | 247 |
245 | | - async def fetch_responses_async(self, |
246 | | - timeout: Optional[float] = None) -> list: |
| 248 | + def fetch_responses(self, timeout: Optional[float] = None) -> list: |
247 | 249 | # TODO: copied from RpcWorker; needs refactoring.
248 | | - logger_debug(f"RayGPUWorker {mpi_rank()} is fetching responses async", |
| 250 | + logger_debug(f"RayGPUWorker {self.rank} is fetching responses", |
249 | 251 | color="yellow") |
250 | | - |
251 | | - responses = await asyncio.to_thread(self.await_responses, |
252 | | - timeout=timeout) |
253 | | - if self._await_response_helper: |
| 252 | + with nvtx_range_debug("RayGPUWorker.fetch_responses", |
| 253 | + color="orange", |
| 254 | + category="Worker"): |
| 255 | + # NOTE: This is a blocking call; it waits until responses are available.
| 256 | + responses = super().await_responses(timeout) |
254 | 257 | self._await_response_helper.responses_handler(responses) |
255 | 258 |
256 | | - if hasattr(self, |
257 | | - '_response_queue') and self._response_queue is not None: |
258 | | - qsize = self._response_queue.qsize() |
259 | | - logger_debug(f"RayGPUWorker returning {qsize} responses", |
260 | | - color="yellow") |
| 259 | + qsize = self._response_queue.qsize() |
| 260 | + logger_debug(f"RayGPUWorker returning {qsize} responses", |
| 261 | + color="yellow") |
| 262 | + |
| 263 | + all_responses = [] |
| 264 | + for _ in range(qsize): |
| 265 | + # The queue contains batches of responses, so extend the list |
| 266 | + all_responses.extend(self._response_queue.get()) |
| 267 | + return all_responses |
261 | 268 |
262 | | - all_responses = [] |
263 | | - for _ in range(qsize): |
264 | | - all_responses.extend(self._response_queue.get()) |
265 | | - return all_responses |
| 269 | + async def fetch_responses_async(self, |
| 270 | + timeout: Optional[float] = None) -> list: |
| 271 | + # TODO: copied from RpcWorker; needs refactoring.
| 272 | + # A truly asynchronous version of fetch_responses.
| 273 | + logger_debug(f"RayGPUWorker {self.rank} is fetching responses async", |
| 274 | + color="yellow") |
266 | 275 |
267 | | - return responses if responses else [] |
| 276 | + # Run the blocking fetch in a worker thread so the event loop is not blocked.
| 277 | + responses = await asyncio.to_thread(self.fetch_responses, |
| 278 | + timeout=timeout) |
| 279 | + return responses |
268 | 280 |
269 | 281 | # for streaming performance |
270 | | - async def fetch_responses_loop_async(self): |
| 282 | + async def fetch_responses_loop_async(self) -> AsyncGenerator[list, None]: |
271 | 283 | # TODO copied from RpcWorker, need refactoring. |
272 | | - shutdown_event = getattr(self, 'shutdown_event', Event()) |
273 | | - |
274 | | - while not shutdown_event.is_set(): |
| 284 | + while not self.shutdown_event.is_set(): |
275 | 285 | responses = await self.fetch_responses_async() |
276 | | - if responses: |
| 286 | + if responses: # Only yield if there are actual responses |
277 | 287 | logger_debug( |
278 | | - f"RayGPUWorker {mpi_rank()} yielding responses: {responses}", |
| 288 | + f"RayGPUWorker {self.rank} is yielding responses: {responses}", |
279 | 289 | color="yellow") |
280 | | - yield responses |
| 290 | + yield responses # yield in batches to reduce IPC overhead
281 | 291 | else: |
| 292 | + # Yield control to the event loop to avoid busy-waiting when there are no responses.
282 | 293 | await asyncio.sleep(0) |
283 | | - |
284 | 294 | logger_debug( |
285 | | - f"RayGPUWorker {mpi_rank()} quitting fetch_responses_loop_async", |
| 295 | + f"RayGPUWorker {self.rank} quitting fetch_responses_loop_async", |
286 | 296 | color="yellow") |
287 | 297 |
288 | 298 | def shutdown(self): |
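For reviewers, here is a minimal sketch of how the streaming path introduced in this diff might be consumed. It is illustrative only: `worker` stands in for an in-process `RayGPUWorker` instance (not a Ray actor handle, whose remote streaming-call syntax differs), and `drain_responses` / `handle_response` are hypothetical names, not part of the API in this change.

```python
import asyncio


async def drain_responses(worker, handle_response) -> None:
    # `fetch_responses_loop_async` is an async generator that yields
    # lists (batches) of responses until `worker.shutdown_event` is set.
    async for batch in worker.fetch_responses_loop_async():
        # Each yielded item is a whole batch; unpacking it here keeps
        # the batching (and its IPC savings) on the transport side.
        for response in batch:
            handle_response(response)


# Example usage: asyncio.run(drain_responses(worker, print))
```

Because the loop yields whole batches rather than individual responses, a consumer pays one hop per batch instead of one per response, which is the IPC optimization the inline comment on the `yield` refers to.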