From 66d3d30b9b65aacd0cad80d894f087da6c32daa9 Mon Sep 17 00:00:00 2001
From: Jason Xie
Date: Fri, 26 Sep 2025 16:48:16 -0700
Subject: [PATCH] Gate invalid triton autotune configs in AOTInductor for GFX95+

Summary:
Hit a lowering error when lowering models on MI350X with FP8 PyTorch: P1966277532

The issue arises from the lack of instruction support for BLOCK_K <= 64 when matrix_instr_nonkdim=16 on GFX95+ hardware. This was previously patched for FP8 Triton in D81180838, but the error now also shows up in AOTI codepaths with FP8 PyTorch.

Differential Revision: D83383625
---
 fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
index 37d6989408..d5c05b5ebc 100644
--- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
+++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py
@@ -3846,7 +3846,7 @@ def _should_skip_config(block_k, matrix_instr_nonkdim):
     """Skip config if BLOCK_K=64 and matrix_instr_nonkdim=16 on GFX95+"""
     try:
         return (
-            block_k == 64
+            block_k <= 64
             and matrix_instr_nonkdim == 16
             and torch.version.hip is not None
             and torch.cuda.get_device_capability() >= (9, 5)
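
Note: a minimal standalone sketch of how this guard filters candidate autotune configs, for review context only. It is not the fbgemm_gpu source; the candidate list, the filter_configs helper, and the except-fallback returning False are assumptions (the hunk above does not show the except branch).

import torch


def _should_skip_config(block_k: int, matrix_instr_nonkdim: int) -> bool:
    """Skip config if BLOCK_K <= 64 and matrix_instr_nonkdim=16 on GFX95+."""
    try:
        return (
            block_k <= 64
            and matrix_instr_nonkdim == 16
            and torch.version.hip is not None  # ROCm build only
            and torch.cuda.get_device_capability() >= (9, 5)  # gfx950 and newer
        )
    except Exception:
        # Assumed fallback: if device queries fail (e.g. no visible GPU), keep the config.
        return False


def filter_configs(candidates):
    """Hypothetical helper: drop configs the current hardware cannot execute."""
    return [
        c
        for c in candidates
        if not _should_skip_config(c["BLOCK_K"], c["matrix_instr_nonkdim"])
    ]


if __name__ == "__main__":
    candidates = [
        {"BLOCK_K": 32, "matrix_instr_nonkdim": 16},   # skipped on GFX95+
        {"BLOCK_K": 64, "matrix_instr_nonkdim": 16},   # skipped on GFX95+
        {"BLOCK_K": 128, "matrix_instr_nonkdim": 16},  # kept everywhere
        {"BLOCK_K": 64, "matrix_instr_nonkdim": 32},   # kept everywhere
    ]
    print(filter_configs(candidates))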