fix(cudagraph): pause GC during capture to prevent Triton finalizer abort (#1322) (#1339)

Jasen2201 · web-flow · commit 069399bbe8a3 · 2026-06-25T12:57:11.000+08:00
During CUDAGraph capture, MiniMax-M3's autotuned _topk_index_partial_kernel
discards candidate CompiledKernels. A gen-0 GC firing inside the stream-capture
region runs CompiledKernel.__del__ -&gt; hipModuleUnload, which HIP forbids while a
stream is capturing (HIP 900), corrupting the capture and aborting the
custom_all_reduce IPC handshake (SIGABRT). gc.freeze() did not help because the
discarded kernels are created mid-loop. Disable GC for the whole capture window
and restore via try/finally.
diff --git a/atom/model_engine/model_runner.py b/atom/model_engine/model_runner.py
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
+import gc
 import logging
 import math
 import os
 import time
-from contextlib import nullcontext
+from contextlib import contextmanager, nullcontext
 from typing import Any, Optional, Union
 
 import numpy as np
@@ -2283,7 +2284,18 @@ def capture_cudagraph(self):
         # TBO graphs don't capture compute_logits, so disable logits_in_graph.
         self.logits_in_graph = self.world_size == 1 and not is_tbo
 
-        with graph_capture() as gc:
+        @contextmanager
+        def pause_gc():
+            # No GC during capture: a finalizer's hipModuleUnload aborts it (HIP 900).
+            gc.collect()
+            gc.disable()
+            try:
+                yield
+            finally:
+                gc.enable()
+                gc.collect()
+
+        with pause_gc(), graph_capture() as capture_ctx:
             capture_range = (
                 tqdm.tqdm(self.graph_bs) if self.rank == 0 else self.graph_bs
             )
@@ -2362,7 +2374,7 @@ def capture_cudagraph(self):
                             input_ids[:num_tokens],
                             positions[:num_tokens],
                             self.graph_pool,
-                            gc.stream,
+                            capture_ctx.stream,
                             output_buffer=outputs[:num_tokens],
                         )
                         graph_aux = None
@@ -2374,7 +2386,9 @@ def capture_cudagraph(self):
                             if self.use_mrope
                             else positions[:num_tokens]
                         )
-                        with torch.cuda.graph(graph, self.graph_pool, stream=gc.stream):
+                        with torch.cuda.graph(
+                            graph, self.graph_pool, stream=capture_ctx.stream
+                        ):
                             model_output = self.model(
                                 input_ids[:num_tokens],
                                 model_positions,