Changes from 16 commits
6 changes: 3 additions & 3 deletions .github/workflows/e2e_ppo_grpo_trainer_trtllm.yml
@@ -93,7 +93,7 @@ permissions:
contents: read

env:
-IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:trtllm1.3.0rc4"
+IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:trtllm1.3.0rc10"
DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"

jobs:
@@ -207,7 +207,7 @@ jobs:
- name: clean up
run: |
rm -rf checkpoints
-e2e_grpo_trainer_fsdp-vlm:
+e2e_grpo_trainer_megatron-vlm:
needs: setup
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
timeout-minutes: 30 # Increase this timeout value as needed
@@ -273,7 +273,7 @@ jobs:

cleanup:
runs-on: ubuntu-latest
-needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2, e2e_grpo_trainer_fsdp-vlm]
+needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2, e2e_grpo_trainer_megatron-vlm]
if: always()
steps:
- id: destroy-runner
16 changes: 10 additions & 6 deletions docker/Dockerfile.stable.trtllm
@@ -1,9 +1,14 @@
# Base image from NGC TensorRT-LLM, which includes a pre-installed TensorRT-LLM.
# For available images, visit: https://nvidia.github.io/TensorRT-LLM/installation/containers.html
# Use TRTLLM_BASE_IMAGE to specify the base image (default: release:1.2.0rc6)
-ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc4
+ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
FROM ${TRTLLM_BASE_IMAGE}

+# Install CUDA forward-compat layer so the container can run on hosts with
+# older NVIDIA drivers (requires host driver >= R535).
+RUN apt-get update && apt-get install -y cuda-compat-13-1 && rm -rf /var/lib/apt/lists/*
+ENV LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH


# ==============================================================================
# Install Megatron dependencies
@@ -22,18 +27,17 @@ RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
pushd ${NVSHMEM_DIR}/lib && \
ln -s libnvshmem_host.so.3 libnvshmem_host.so && \
popd && \
-git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git && \
+git clone -b hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \
pushd DeepEP && \
wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch && \
patch -p1 < deepep.patch && \
export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:$CPATH && \
TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install && \
popd && rm -rf deepep

# Install Python dependencies
-RUN pip3 install --no-cache-dir --no-deps trl && \
+RUN pip3 install --no-cache-dir --no-deps trl==0.27.0 && \
pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools && \
pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git && \
-pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0
+pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.16.0


# ==============================================================================
13 changes: 4 additions & 9 deletions docs/workers/trtllm_worker.rst
@@ -1,7 +1,7 @@
TensorRT-LLM Backend
====================

-Last updated: 12/31/2025.
+Last updated: 4/2/2026.

**Authored By TensorRT-LLM Team**

@@ -14,7 +14,7 @@ The TensorRT-LLM rollout engine primarily targets the colocated mode. Instead of

Installation
------------
-We provide ``docker/Dockerfile.stable.trtllm`` for building a docker image with TensorRT-LLM pre-installed. The verl integration is supported from ``nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6``, and you can choose other TensorRT-LLM versions via ``TRTLLM_BASE_IMAGE`` from the `NGC Catalog <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release>`_.
+We provide `docker/Dockerfile.stable.trtllm <https://github.com/verl-project/verl/blob/main/docker/Dockerfile.stable.trtllm>`_ for building a docker image with TensorRT-LLM pre-installed. The verl integration is supported from ``nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6``, and you can choose other TensorRT-LLM versions via ``TRTLLM_BASE_IMAGE`` from the `NGC Catalog <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release>`_.

Alternatively, refer to the `TensorRT-LLM installation guide <https://nvidia.github.io/TensorRT-LLM/installation/index.html>`_ for compatible environments if you want to build your own.
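For example, the base image can be overridden at build time via the ``TRTLLM_BASE_IMAGE`` build argument declared in the Dockerfile (the output image tag here is illustrative, not a project convention):

.. code-block:: bash

   # Build with a specific TensorRT-LLM base image from the NGC Catalog.
   docker build -f docker/Dockerfile.stable.trtllm \
       --build-arg TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10 \
       -t verl:trtllm-custom .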

@@ -51,12 +51,7 @@ We provide the following GRPO recipe scripts for you to test the performance and
Using TensorRT-LLM as the Rollout Engine for DAPO
-------------------------------------------------

We provide a DAPO recipe script ``recipe/dapo/test_dapo_7b_math_trtllm.sh``.

.. code-block:: bash

## For FSDP training engine
bash recipe/dapo/test_dapo_7b_math_trtllm.sh
## For Megatron-Core training engine
TRAIN_ENGINE=megatron bash recipe/dapo/test_dapo_7b_math_trtllm.sh

# For Megatron-Core training engine with FP8 rollout
bash examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh
18 changes: 15 additions & 3 deletions verl/workers/rollout/trtllm_rollout/trtllm_async_server.py
@@ -117,6 +117,12 @@ def get_server_address(self):
async def launch_server(self):
from tensorrt_llm import AsyncLLM
from tensorrt_llm.llmapi import CapacitySchedulerPolicy, CudaGraphConfig, KvCacheConfig, SchedulerConfig

+try:
+    from tensorrt_llm.llmapi.llm_args import ExecutorMemoryType, SleepConfig
+except ImportError:
+    ExecutorMemoryType = None
+    SleepConfig = None
from tensorrt_llm.serve import OpenAIServer

assert self.config.pipeline_model_parallel_size == 1, "pipeline_model_parallel_size > 1 is not supported yet"
@@ -164,7 +170,14 @@ async def launch_server(self):
"placement_groups": self.pgs,
"placement_bundle_indices": self.bundle_indices,
"per_worker_gpu_share": per_worker_gpu_share,
-"enable_sleep": self.config.enable_sleep_mode,
+"sleep_config": SleepConfig(
+    restore_modes={
+        ExecutorMemoryType.MODEL_WEIGHTS_MAIN: "NONE",
+        ExecutorMemoryType.KV_CACHE: "NONE",
+    }
+)
+if self.config.enable_sleep_mode and SleepConfig is not None
+else None,
"allreduce_strategy": "NCCL",
"sampler_type": "TRTLLMSampler",
**engine_kwargs,
@@ -348,8 +361,8 @@ def get_pgs_and_bundle_indices(self) -> tuple[list[PlacementGroup], list[list[int]]]:
local_bundle_index = self.world_size * self.replica_rank

while local_bundle_index >= self.resource_pool.pgs[start_pg_index].bundle_count:
-    start_pg_index += 1
    local_bundle_index -= self.resource_pool.pgs[start_pg_index].bundle_count
+    start_pg_index += 1
assert (
start_pg_index < len(self.resource_pool.pgs)
and local_bundle_index < self.resource_pool.pgs[start_pg_index].bundle_count
@@ -386,7 +399,6 @@ def get_pgs_and_bundle_indices(self) -> tuple[list[PlacementGroup], list[list[int]]]:
return pgs, bundle_indices

async def launch_servers(self):
-assert self.nnodes == 1, "TRTLLMReplica doesn't support multiple nodes for single replica yet."
assert self.resource_pool.pgs is not None, "placement groups are not initialized"

pgs, bundle_indices = self.get_pgs_and_bundle_indices()
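The reordered loop in ``get_pgs_and_bundle_indices`` maps a replica's global bundle offset to a placement-group index plus a local offset; subtracting the current group's ``bundle_count`` before advancing the index is what the fix guarantees. A standalone sketch of the corrected logic (names are illustrative; ``bundle_counts`` stands in for ``self.resource_pool.pgs[i].bundle_count``):

```python
def locate_bundle(global_index: int, bundle_counts: list[int]) -> tuple[int, int]:
    """Map a global bundle offset to (pg_index, local_index).

    The subtraction must use the *current* group's count before the index
    advances; incrementing first (the pre-fix order) would subtract the
    next group's count and land on the wrong bundle when groups differ
    in size.
    """
    pg_index = 0
    local_index = global_index
    while local_index >= bundle_counts[pg_index]:
        local_index -= bundle_counts[pg_index]
        pg_index += 1
    assert pg_index < len(bundle_counts) and local_index < bundle_counts[pg_index]
    return pg_index, local_index

# Global offset 7 across groups of size [2, 4, 4] lands in group 2, slot 1.
print(locate_bundle(7, [2, 4, 4]))  # → (2, 1)
```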
34 changes: 24 additions & 10 deletions verl/workers/rollout/trtllm_rollout/trtllm_rollout.py
@@ -29,6 +29,11 @@
import ray
import torch
import torch.distributed as dist

+try:
+    from tensorrt_llm.llmapi.llm_args import ExecutorMemoryType
+except ImportError:
+    ExecutorMemoryType = None
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
from torch.multiprocessing.reductions import reduce_tensor

Expand Down Expand Up @@ -260,16 +265,25 @@ async def update_weights(self, weights: dict[str, str]):


class ServerAdapter(BaseRollout):
-_WEIGHTS_TAGS = [
-    "sampler",
-    "drafter",
-    "guided_decoder",
-    "spec_resource_manager",
-    "model_extra",
-    "executor_extra",
-    "model",
-    "draft_model",
-]
+# All releasable/resumable weight tags: every ExecutorMemoryType except kv_cache
+# (handled separately) and internal tags prefixed with "_".
+# Fall back to a hard-coded list for trtllm versions that don't export ExecutorMemoryType.
+_WEIGHTS_TAGS = (
+    [t.value for t in ExecutorMemoryType if t is not ExecutorMemoryType.KV_CACHE and not t.value.startswith("_")]
+    if ExecutorMemoryType is not None
+    else [
+        "sampler",
+        "drafter",
+        "guided_decoder",
+        "spec_resource_manager",
+        "model_extra",
+        "executor_extra",
+        "model",
+        "model_weights",
[Review comment from Collaborator]
@Superjomn Is this section for backward compatibility (for older trtllm versions before we had the fine-grained labels)? If yes, then it should be exactly the same as the old tags at https://github.com/verl-project/verl/pull/5841/changes#diff-4d19b99d5dc8054a16c391ce00301671727c4c3549ecb6d904d33c2aa1f552beL263 (i.e. without model_weights and draft_model_weights). Otherwise I think it'll error out.

[Reply from Collaborator Author]
Sure, let me update it.
+        "draft_model",
+        "draft_model_weights",
+    ]
+)

@staticmethod
def get_full_tags() -> list[str]:
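The comprehension above derives the tag list from the enum at runtime instead of hard-coding it. A self-contained sketch of the same filtering pattern, using a stand-in enum since ``ExecutorMemoryType`` ships with TensorRT-LLM and its exact members may differ by version:

```python
from enum import Enum


class MemoryType(Enum):
    # Stand-in for tensorrt_llm's ExecutorMemoryType; member names and
    # values here are illustrative, not the library's actual set.
    MODEL = "model"
    MODEL_WEIGHTS = "model_weights"
    KV_CACHE = "kv_cache"
    INTERNAL_SCRATCH = "_scratch"  # internal tags carry a "_" prefix


# Keep every tag except the KV cache (managed separately) and internal "_" tags.
weights_tags = [
    t.value
    for t in MemoryType
    if t is not MemoryType.KV_CACHE and not t.value.startswith("_")
]
print(weights_tags)  # → ['model', 'model_weights']
```

Deriving the list from the enum keeps the rollout worker in sync with whatever fine-grained tags a given trtllm build exposes, while the hard-coded fallback covers builds that predate the enum.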