Add cutlass decode kernel to TritonBench (#4853)

Aya-ZIbra · facebook-github-bot · commit 68081f501014 · 2025-09-22T13:27:33.000-07:00
Summary: Pull Request resolved: #4853 X-link: facebookresearch/FBGEMM#1875 Add cutlass blackwell FMHA decode kernel implementation to TritonBench benchmarking suite . Reviewed By: sryap Differential Revision: D80041532
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu
@@ -304,7 +304,7 @@ at::Tensor dispatch_fmha_gen_fwd(
 
   return DISPATCH_ELEMENT_TYPE(q.scalar_type(), Element, [&] {
     return DISPATCH_KERNEL_TYPE(static_cast<int>(kernel_type), KType, [&] {
-      GenRunner<Element, KType, Shape<_128, _128, _128>, Shape<_1, _1, _1>>
+      GenRunner<Element, KType, Shape<_128, _256, _128>, Shape<_1, _1, _1>>
           runner;
       return runner.fmha_fwd(q, k, v, seqlen_kv, batch_idx);
     });